1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "radv_private.h"
29 #include "radv_radeon_winsys.h"
30 #include "radv_shader.h"
31 #include "radv_cs.h"
32 #include "sid.h"
33 #include "vk_format.h"
34 #include "vk_util.h"
35 #include "radv_debug.h"
36 #include "radv_meta.h"
37 
38 #include "ac_debug.h"
39 
/*
 * RADV_PREFETCH_* bit flags: one bit for the VBO descriptors and one bit per
 * shader stage. RADV_PREFETCH_SHADERS is the union of the five shader bits.
 */
enum {
	RADV_PREFETCH_VBO_DESCRIPTORS	= 0x01,
	RADV_PREFETCH_VS		= 0x02,
	RADV_PREFETCH_TCS		= 0x04,
	RADV_PREFETCH_TES		= 0x08,
	RADV_PREFETCH_GS		= 0x10,
	RADV_PREFETCH_PS		= 0x20,
	RADV_PREFETCH_SHADERS		= RADV_PREFETCH_VS | RADV_PREFETCH_TCS |
					  RADV_PREFETCH_TES | RADV_PREFETCH_GS |
					  RADV_PREFETCH_PS
};
53 
54 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
55 					 struct radv_image *image,
56 					 VkImageLayout src_layout,
57 					 bool src_render_loop,
58 					 VkImageLayout dst_layout,
59 					 bool dst_render_loop,
60 					 uint32_t src_family,
61 					 uint32_t dst_family,
62 					 const VkImageSubresourceRange *range,
63 					 struct radv_sample_locations_state *sample_locs);
64 
65 const struct radv_dynamic_state default_dynamic_state = {
66 	.viewport = {
67 		.count = 0,
68 	},
69 	.scissor = {
70 		.count = 0,
71 	},
72 	.line_width = 1.0f,
73 	.depth_bias = {
74 		.bias = 0.0f,
75 		.clamp = 0.0f,
76 		.slope = 0.0f,
77 	},
78 	.blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
79 	.depth_bounds = {
80 		.min = 0.0f,
81 		.max = 1.0f,
82 	},
83 	.stencil_compare_mask = {
84 		.front = ~0u,
85 		.back = ~0u,
86 	},
87 	.stencil_write_mask = {
88 		.front = ~0u,
89 		.back = ~0u,
90 	},
91 	.stencil_reference = {
92 		.front = 0u,
93 		.back = 0u,
94 	},
95 	.line_stipple = {
96 		.factor = 0u,
97 		.pattern = 0u,
98 	},
99 	.cull_mode = 0u,
100 	.front_face = 0u,
101 	.primitive_topology = 0u,
102 };
103 
104 static void
radv_bind_dynamic_state(struct radv_cmd_buffer * cmd_buffer,const struct radv_dynamic_state * src)105 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer,
106 			const struct radv_dynamic_state *src)
107 {
108 	struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
109 	uint32_t copy_mask = src->mask;
110 	uint32_t dest_mask = 0;
111 
112 	dest->discard_rectangle.count = src->discard_rectangle.count;
113 	dest->sample_location.count = src->sample_location.count;
114 
115 	if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
116 		if (dest->viewport.count != src->viewport.count) {
117 			dest->viewport.count = src->viewport.count;
118 			dest_mask |= RADV_DYNAMIC_VIEWPORT;
119 		}
120 
121 		if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
122 			   src->viewport.count * sizeof(VkViewport))) {
123 			typed_memcpy(dest->viewport.viewports,
124 				     src->viewport.viewports,
125 				     src->viewport.count);
126 			dest_mask |= RADV_DYNAMIC_VIEWPORT;
127 		}
128 	}
129 
130 	if (copy_mask & RADV_DYNAMIC_SCISSOR) {
131 		if (dest->scissor.count != src->scissor.count) {
132 			dest->scissor.count = src->scissor.count;
133 			dest_mask |= RADV_DYNAMIC_SCISSOR;
134 		}
135 
136 		if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
137 			   src->scissor.count * sizeof(VkRect2D))) {
138 			typed_memcpy(dest->scissor.scissors,
139 				     src->scissor.scissors, src->scissor.count);
140 			dest_mask |= RADV_DYNAMIC_SCISSOR;
141 		}
142 	}
143 
144 	if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
145 		if (dest->line_width != src->line_width) {
146 			dest->line_width = src->line_width;
147 			dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
148 		}
149 	}
150 
151 	if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
152 		if (memcmp(&dest->depth_bias, &src->depth_bias,
153 			   sizeof(src->depth_bias))) {
154 			dest->depth_bias = src->depth_bias;
155 			dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
156 		}
157 	}
158 
159 	if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
160 		if (memcmp(&dest->blend_constants, &src->blend_constants,
161 			   sizeof(src->blend_constants))) {
162 			typed_memcpy(dest->blend_constants,
163 				     src->blend_constants, 4);
164 			dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
165 		}
166 	}
167 
168 	if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
169 		if (memcmp(&dest->depth_bounds, &src->depth_bounds,
170 			   sizeof(src->depth_bounds))) {
171 			dest->depth_bounds = src->depth_bounds;
172 			dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
173 		}
174 	}
175 
176 	if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
177 		if (memcmp(&dest->stencil_compare_mask,
178 			   &src->stencil_compare_mask,
179 			   sizeof(src->stencil_compare_mask))) {
180 			dest->stencil_compare_mask = src->stencil_compare_mask;
181 			dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
182 		}
183 	}
184 
185 	if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
186 		if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
187 			   sizeof(src->stencil_write_mask))) {
188 			dest->stencil_write_mask = src->stencil_write_mask;
189 			dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
190 		}
191 	}
192 
193 	if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
194 		if (memcmp(&dest->stencil_reference, &src->stencil_reference,
195 			   sizeof(src->stencil_reference))) {
196 			dest->stencil_reference = src->stencil_reference;
197 			dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
198 		}
199 	}
200 
201 	if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
202 		if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
203 			   src->discard_rectangle.count * sizeof(VkRect2D))) {
204 			typed_memcpy(dest->discard_rectangle.rectangles,
205 				     src->discard_rectangle.rectangles,
206 				     src->discard_rectangle.count);
207 			dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
208 		}
209 	}
210 
211 	if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
212 		if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
213 		    dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
214 		    dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
215 		    memcmp(&dest->sample_location.locations,
216 			   &src->sample_location.locations,
217 			   src->sample_location.count * sizeof(VkSampleLocationEXT))) {
218 			dest->sample_location.per_pixel = src->sample_location.per_pixel;
219 			dest->sample_location.grid_size = src->sample_location.grid_size;
220 			typed_memcpy(dest->sample_location.locations,
221 				     src->sample_location.locations,
222 				     src->sample_location.count);
223 			dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
224 		}
225 	}
226 
227 	if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
228 		if (memcmp(&dest->line_stipple, &src->line_stipple,
229 			   sizeof(src->line_stipple))) {
230 			dest->line_stipple = src->line_stipple;
231 			dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
232 		}
233 	}
234 
235 	if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
236 		if (dest->cull_mode != src->cull_mode) {
237 			dest->cull_mode = src->cull_mode;
238 			dest_mask |= RADV_DYNAMIC_CULL_MODE;
239 		}
240 	}
241 
242 	if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
243 		if (dest->front_face != src->front_face) {
244 			dest->front_face = src->front_face;
245 			dest_mask |= RADV_DYNAMIC_FRONT_FACE;
246 		}
247 	}
248 
249 	if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
250 		if (dest->primitive_topology != src->primitive_topology) {
251 			dest->primitive_topology = src->primitive_topology;
252 			dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
253 		}
254 	}
255 
256 	if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
257 		if (dest->depth_test_enable != src->depth_test_enable) {
258 			dest->depth_test_enable = src->depth_test_enable;
259 			dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
260 		}
261 	}
262 
263 	if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
264 		if (dest->depth_write_enable != src->depth_write_enable) {
265 			dest->depth_write_enable = src->depth_write_enable;
266 			dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
267 		}
268 	}
269 
270 	if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
271 		if (dest->depth_compare_op != src->depth_compare_op) {
272 			dest->depth_compare_op = src->depth_compare_op;
273 			dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
274 		}
275 	}
276 
277 	if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
278 		if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
279 			dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
280 			dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
281 		}
282 	}
283 
284 	if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
285 		if (dest->stencil_test_enable != src->stencil_test_enable) {
286 			dest->stencil_test_enable = src->stencil_test_enable;
287 			dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
288 		}
289 	}
290 
291 	if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
292 		if (memcmp(&dest->stencil_op, &src->stencil_op,
293 			   sizeof(src->stencil_op))) {
294 			dest->stencil_op = src->stencil_op;
295 			dest_mask |= RADV_DYNAMIC_STENCIL_OP;
296 		}
297 	}
298 
299 	cmd_buffer->state.dirty |= dest_mask;
300 }
301 
302 static void
radv_bind_streamout_state(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline)303 radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer,
304 			  struct radv_pipeline *pipeline)
305 {
306 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
307 	struct radv_shader_info *info;
308 
309 	if (!pipeline->streamout_shader ||
310 	    cmd_buffer->device->physical_device->use_ngg_streamout)
311 		return;
312 
313 	info = &pipeline->streamout_shader->info;
314 	for (int i = 0; i < MAX_SO_BUFFERS; i++)
315 		so->stride_in_dw[i] = info->so.strides[i];
316 
317 	so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
318 }
319 
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer * cmd_buffer)320 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
321 {
322 	return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
323 	       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
324 }
325 
radv_queue_family_to_ring(int f)326 enum ring_type radv_queue_family_to_ring(int f) {
327 	switch (f) {
328 	case RADV_QUEUE_GENERAL:
329 		return RING_GFX;
330 	case RADV_QUEUE_COMPUTE:
331 		return RING_COMPUTE;
332 	case RADV_QUEUE_TRANSFER:
333 		return RING_DMA;
334 	default:
335 		unreachable("Unknown queue family");
336 	}
337 }
338 
339 static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer * cmd_buffer)340 radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
341 {
342 	list_del(&cmd_buffer->pool_link);
343 
344 	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
345 				 &cmd_buffer->upload.list, list) {
346 		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
347 		list_del(&up->list);
348 		free(up);
349 	}
350 
351 	if (cmd_buffer->upload.upload_bo)
352 		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
353 
354 	if (cmd_buffer->cs)
355 		cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
356 
357 	for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
358 		free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
359 
360 	vk_object_base_finish(&cmd_buffer->base);
361 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
362 }
363 
radv_create_cmd_buffer(struct radv_device * device,struct radv_cmd_pool * pool,VkCommandBufferLevel level,VkCommandBuffer * pCommandBuffer)364 static VkResult radv_create_cmd_buffer(
365 	struct radv_device *                         device,
366 	struct radv_cmd_pool *                       pool,
367 	VkCommandBufferLevel                        level,
368 	VkCommandBuffer*                            pCommandBuffer)
369 {
370 	struct radv_cmd_buffer *cmd_buffer;
371 	unsigned ring;
372 	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
373 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
374 	if (cmd_buffer == NULL)
375 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
376 
377 	vk_object_base_init(&device->vk, &cmd_buffer->base,
378 			    VK_OBJECT_TYPE_COMMAND_BUFFER);
379 
380 	cmd_buffer->device = device;
381 	cmd_buffer->pool = pool;
382 	cmd_buffer->level = level;
383 
384 	list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
385 	cmd_buffer->queue_family_index = pool->queue_family_index;
386 
387 	ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
388 
389 	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
390 	if (!cmd_buffer->cs) {
391 		radv_destroy_cmd_buffer(cmd_buffer);
392 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
393 	}
394 
395 	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
396 
397 	list_inithead(&cmd_buffer->upload.list);
398 
399 	return VK_SUCCESS;
400 }
401 
402 static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer * cmd_buffer)403 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
404 {
405 	cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
406 
407 	list_for_each_entry_safe(struct radv_cmd_buffer_upload, up,
408 				 &cmd_buffer->upload.list, list) {
409 		cmd_buffer->device->ws->buffer_destroy(up->upload_bo);
410 		list_del(&up->list);
411 		free(up);
412 	}
413 
414 	cmd_buffer->push_constant_stages = 0;
415 	cmd_buffer->scratch_size_per_wave_needed = 0;
416 	cmd_buffer->scratch_waves_wanted = 0;
417 	cmd_buffer->compute_scratch_size_per_wave_needed = 0;
418 	cmd_buffer->compute_scratch_waves_wanted = 0;
419 	cmd_buffer->esgs_ring_size_needed = 0;
420 	cmd_buffer->gsvs_ring_size_needed = 0;
421 	cmd_buffer->tess_rings_needed = false;
422 	cmd_buffer->gds_needed = false;
423 	cmd_buffer->gds_oa_needed = false;
424 	cmd_buffer->sample_positions_needed = false;
425 
426 	if (cmd_buffer->upload.upload_bo)
427 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
428 				   cmd_buffer->upload.upload_bo);
429 	cmd_buffer->upload.offset = 0;
430 
431 	cmd_buffer->record_result = VK_SUCCESS;
432 
433 	memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
434 
435 	for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
436 		cmd_buffer->descriptors[i].dirty = 0;
437 		cmd_buffer->descriptors[i].valid = 0;
438 		cmd_buffer->descriptors[i].push_dirty = false;
439 	}
440 
441 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
442 	    cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
443 		unsigned num_db = cmd_buffer->device->physical_device->rad_info.num_render_backends;
444 		unsigned fence_offset, eop_bug_offset;
445 		void *fence_ptr;
446 
447 		radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 8, &fence_offset,
448 					     &fence_ptr);
449 
450 		cmd_buffer->gfx9_fence_va =
451 			radv_buffer_get_va(cmd_buffer->upload.upload_bo);
452 		cmd_buffer->gfx9_fence_va += fence_offset;
453 
454 		if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
455 			/* Allocate a buffer for the EOP bug on GFX9. */
456 			radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, 8,
457 						     &eop_bug_offset, &fence_ptr);
458 			cmd_buffer->gfx9_eop_bug_va =
459 				radv_buffer_get_va(cmd_buffer->upload.upload_bo);
460 			cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
461 		}
462 	}
463 
464 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
465 
466 	return cmd_buffer->record_result;
467 }
468 
469 static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer * cmd_buffer,uint64_t min_needed)470 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
471 				  uint64_t min_needed)
472 {
473 	uint64_t new_size;
474 	struct radeon_winsys_bo *bo;
475 	struct radv_cmd_buffer_upload *upload;
476 	struct radv_device *device = cmd_buffer->device;
477 
478 	new_size = MAX2(min_needed, 16 * 1024);
479 	new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
480 
481 	bo = device->ws->buffer_create(device->ws,
482 				       new_size, 4096,
483 				       RADEON_DOMAIN_GTT,
484 				       RADEON_FLAG_CPU_ACCESS|
485 				       RADEON_FLAG_NO_INTERPROCESS_SHARING |
486 				       RADEON_FLAG_32BIT |
487 				       RADEON_FLAG_GTT_WC,
488 				       RADV_BO_PRIORITY_UPLOAD_BUFFER);
489 
490 	if (!bo) {
491 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
492 		return false;
493 	}
494 
495 	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
496 	if (cmd_buffer->upload.upload_bo) {
497 		upload = malloc(sizeof(*upload));
498 
499 		if (!upload) {
500 			cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
501 			device->ws->buffer_destroy(bo);
502 			return false;
503 		}
504 
505 		memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
506 		list_add(&upload->list, &cmd_buffer->upload.list);
507 	}
508 
509 	cmd_buffer->upload.upload_bo = bo;
510 	cmd_buffer->upload.size = new_size;
511 	cmd_buffer->upload.offset = 0;
512 	cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
513 
514 	if (!cmd_buffer->upload.map) {
515 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
516 		return false;
517 	}
518 
519 	return true;
520 }
521 
522 bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer * cmd_buffer,unsigned size,unsigned alignment,unsigned * out_offset,void ** ptr)523 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
524 			     unsigned size,
525 			     unsigned alignment,
526 			     unsigned *out_offset,
527 			     void **ptr)
528 {
529 	assert(util_is_power_of_two_nonzero(alignment));
530 
531 	uint64_t offset = align(cmd_buffer->upload.offset, alignment);
532 	if (offset + size > cmd_buffer->upload.size) {
533 		if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
534 			return false;
535 		offset = 0;
536 	}
537 
538 	*out_offset = offset;
539 	*ptr = cmd_buffer->upload.map + offset;
540 
541 	cmd_buffer->upload.offset = offset + size;
542 	return true;
543 }
544 
/*
 * Reserve space in the upload buffer and copy size bytes from data into it.
 *
 * *out_offset receives the offset of the copy inside the upload buffer.
 * Returns false when the space could not be reserved. The copy is skipped
 * when the reservation yields a NULL pointer.
 */
bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer,
			    unsigned size, unsigned alignment,
			    const void *data, unsigned *out_offset)
{
	void *dst;
	bool reserved = radv_cmd_buffer_upload_alloc(cmd_buffer, size, alignment,
						     out_offset, &dst);

	if (!reserved)
		return false;

	if (dst != NULL)
		memcpy(dst, data, size);

	return true;
}
561 
562 static void
radv_emit_write_data_packet(struct radv_cmd_buffer * cmd_buffer,uint64_t va,unsigned count,const uint32_t * data)563 radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
564 			    unsigned count, const uint32_t *data)
565 {
566 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
567 
568 	radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
569 
570 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
571 	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
572 		    S_370_WR_CONFIRM(1) |
573 		    S_370_ENGINE_SEL(V_370_ME));
574 	radeon_emit(cs, va);
575 	radeon_emit(cs, va >> 32);
576 	radeon_emit_array(cs, data, count);
577 }
578 
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer * cmd_buffer)579 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
580 {
581 	struct radv_device *device = cmd_buffer->device;
582 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
583 	uint64_t va;
584 
585 	va = radv_buffer_get_va(device->trace_bo);
586 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
587 		va += 4;
588 
589 	++cmd_buffer->state.trace_id;
590 	radv_emit_write_data_packet(cmd_buffer, va, 1,
591 				    &cmd_buffer->state.trace_id);
592 
593 	radeon_check_space(cmd_buffer->device->ws, cs, 2);
594 
595 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
596 	radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
597 }
598 
599 static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer * cmd_buffer,enum radv_cmd_flush_bits flags)600 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
601 			   enum radv_cmd_flush_bits flags)
602 {
603 	if (unlikely(cmd_buffer->device->thread_trace_bo)) {
604 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
605 		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
606 	}
607 
608 	if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
609 		enum rgp_flush_bits sqtt_flush_bits = 0;
610 		assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
611 				RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
612 
613 		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
614 
615 		/* Force wait for graphics or compute engines to be idle. */
616 		si_cs_emit_cache_flush(cmd_buffer->cs,
617 				       cmd_buffer->device->physical_device->rad_info.chip_class,
618 				       &cmd_buffer->gfx9_fence_idx,
619 				       cmd_buffer->gfx9_fence_va,
620 				       radv_cmd_buffer_uses_mec(cmd_buffer),
621 				       flags, &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
622 	}
623 
624 	if (unlikely(cmd_buffer->device->trace_bo))
625 		radv_cmd_buffer_trace_emit(cmd_buffer);
626 }
627 
628 static void
radv_save_pipeline(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline)629 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer,
630 		   struct radv_pipeline *pipeline)
631 {
632 	struct radv_device *device = cmd_buffer->device;
633 	enum ring_type ring;
634 	uint32_t data[2];
635 	uint64_t va;
636 
637 	va = radv_buffer_get_va(device->trace_bo);
638 
639 	ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
640 
641 	switch (ring) {
642 	case RING_GFX:
643 		va += 8;
644 		break;
645 	case RING_COMPUTE:
646 		va += 16;
647 		break;
648 	default:
649 		assert(!"invalid ring type");
650 	}
651 
652 	uint64_t pipeline_address = (uintptr_t)pipeline;
653 	data[0] = pipeline_address;
654 	data[1] = pipeline_address >> 32;
655 
656 	radv_emit_write_data_packet(cmd_buffer, va, 2, data);
657 }
658 
/*
 * Bind set at slot idx of the descriptor state for bind_point and flag that
 * slot as both valid and dirty.
 */
void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
			     VkPipelineBindPoint bind_point,
			     struct radv_descriptor_set *set,
			     unsigned idx)
{
	struct radv_descriptor_state *state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	const uint32_t slot_bit = 1u << idx;

	state->sets[idx] = set;

	state->valid |= slot_bit; /* active descriptors */
	state->dirty |= slot_bit;
}
672 
673 static void
radv_save_descriptors(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)674 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer,
675 		      VkPipelineBindPoint bind_point)
676 {
677 	struct radv_descriptor_state *descriptors_state =
678 		radv_get_descriptors_state(cmd_buffer, bind_point);
679 	struct radv_device *device = cmd_buffer->device;
680 	uint32_t data[MAX_SETS * 2] = {0};
681 	uint64_t va;
682 	unsigned i;
683 	va = radv_buffer_get_va(device->trace_bo) + 24;
684 
685 	for_each_bit(i, descriptors_state->valid) {
686 		struct radv_descriptor_set *set = descriptors_state->sets[i];
687 		data[i * 2] = (uint64_t)(uintptr_t)set;
688 		data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
689 	}
690 
691 	radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
692 }
693 
694 struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline * pipeline,gl_shader_stage stage,int idx)695 radv_lookup_user_sgpr(struct radv_pipeline *pipeline,
696 		      gl_shader_stage stage,
697 		      int idx)
698 {
699 	struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
700 	return &shader->info.user_sgprs_locs.shader_data[idx];
701 }
702 
703 static void
radv_emit_userdata_address(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline,gl_shader_stage stage,int idx,uint64_t va)704 radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer,
705 			   struct radv_pipeline *pipeline,
706 			   gl_shader_stage stage,
707 			   int idx, uint64_t va)
708 {
709 	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
710 	uint32_t base_reg = pipeline->user_data_0[stage];
711 	if (loc->sgpr_idx == -1)
712 		return;
713 
714 	assert(loc->num_sgprs == 1);
715 
716 	radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
717 				 base_reg + loc->sgpr_idx * 4, va, false);
718 }
719 
720 static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline,struct radv_descriptor_state * descriptors_state,gl_shader_stage stage)721 radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
722 			      struct radv_pipeline *pipeline,
723 			      struct radv_descriptor_state *descriptors_state,
724 			      gl_shader_stage stage)
725 {
726 	struct radv_device *device = cmd_buffer->device;
727 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
728 	uint32_t sh_base = pipeline->user_data_0[stage];
729 	struct radv_userdata_locations *locs =
730 		&pipeline->shaders[stage]->info.user_sgprs_locs;
731 	unsigned mask = locs->descriptor_sets_enabled;
732 
733 	mask &= descriptors_state->dirty & descriptors_state->valid;
734 
735 	while (mask) {
736 		int start, count;
737 
738 		u_bit_scan_consecutive_range(&mask, &start, &count);
739 
740 		struct radv_userdata_info *loc = &locs->descriptor_sets[start];
741 		unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
742 
743 		radv_emit_shader_pointer_head(cs, sh_offset, count, true);
744 		for (int i = 0; i < count; i++) {
745 			struct radv_descriptor_set *set =
746 				descriptors_state->sets[start + i];
747 
748 			radv_emit_shader_pointer_body(device, cs, set->va, true);
749 		}
750 	}
751 }
752 
753 /**
754  * Convert the user sample locations to hardware sample locations (the values
755  * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
756  */
757 static void
radv_convert_user_sample_locs(struct radv_sample_locations_state * state,uint32_t x,uint32_t y,VkOffset2D * sample_locs)758 radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
759 			      uint32_t x, uint32_t y, VkOffset2D *sample_locs)
760 {
761 	uint32_t x_offset = x % state->grid_size.width;
762 	uint32_t y_offset = y % state->grid_size.height;
763 	uint32_t num_samples = (uint32_t)state->per_pixel;
764 	VkSampleLocationEXT *user_locs;
765 	uint32_t pixel_offset;
766 
767 	pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
768 
769 	assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
770 	user_locs = &state->locations[pixel_offset];
771 
772 	for (uint32_t i = 0; i < num_samples; i++) {
773 		float shifted_pos_x = user_locs[i].x - 0.5;
774 		float shifted_pos_y = user_locs[i].y - 0.5;
775 
776 		int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
777 		int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
778 
779 		sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
780 		sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
781 	}
782 }
783 
784 /**
785  * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
786  * locations.
787  */
788 static void
radv_compute_sample_locs_pixel(uint32_t num_samples,VkOffset2D * sample_locs,uint32_t * sample_locs_pixel)789 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
790 			       uint32_t *sample_locs_pixel)
791 {
792 	for (uint32_t i = 0; i < num_samples; i++) {
793 		uint32_t sample_reg_idx = i / 4;
794 		uint32_t sample_loc_idx = i % 4;
795 		int32_t pos_x = sample_locs[i].x;
796 		int32_t pos_y = sample_locs[i].y;
797 
798 		uint32_t shift_x = 8 * sample_loc_idx;
799 		uint32_t shift_y = shift_x + 4;
800 
801 		sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
802 		sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
803 	}
804 }
805 
806 /**
807  * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
808  * sample locations.
809  */
810 static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer * cmd_buffer,VkOffset2D * sample_locs,uint32_t num_samples)811 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer,
812 			       VkOffset2D *sample_locs,
813 			       uint32_t num_samples)
814 {
815 	uint32_t centroid_priorities[num_samples];
816 	uint32_t sample_mask = num_samples - 1;
817 	uint32_t distances[num_samples];
818 	uint64_t centroid_priority = 0;
819 
820 	/* Compute the distances from center for each sample. */
821 	for (int i = 0; i < num_samples; i++) {
822 		distances[i] = (sample_locs[i].x * sample_locs[i].x) +
823 			       (sample_locs[i].y * sample_locs[i].y);
824 	}
825 
826 	/* Compute the centroid priorities by looking at the distances array. */
827 	for (int i = 0; i < num_samples; i++) {
828 		uint32_t min_idx = 0;
829 
830 		for (int j = 1; j < num_samples; j++) {
831 			if (distances[j] < distances[min_idx])
832 				min_idx = j;
833 		}
834 
835 		centroid_priorities[i] = min_idx;
836 		distances[min_idx] = 0xffffffff;
837 	}
838 
839 	/* Compute the final centroid priority. */
840 	for (int i = 0; i < 8; i++) {
841 		centroid_priority |=
842 			centroid_priorities[i & sample_mask] << (i * 4);
843 	}
844 
845 	return centroid_priority << 32 | centroid_priority;
846 }
847 
848 /**
849  * Emit the sample locations that are specified with VK_EXT_sample_locations.
850  */
851 static void
radv_emit_sample_locations(struct radv_cmd_buffer * cmd_buffer)852 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
853 {
854 	struct radv_sample_locations_state *sample_location =
855 		&cmd_buffer->state.dynamic.sample_location;
856 	uint32_t num_samples = (uint32_t)sample_location->per_pixel;
857 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
858 	uint32_t sample_locs_pixel[4][2] = {0};
859 	VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
860 	uint32_t max_sample_dist = 0;
861 	uint64_t centroid_priority;
862 
863 	if (!cmd_buffer->state.dynamic.sample_location.count)
864 		return;
865 
866 	/* Convert the user sample locations to hardware sample locations. */
867 	radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
868 	radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
869 	radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
870 	radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
871 
872 	/* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
873 	for (uint32_t i = 0; i < 4; i++) {
874 		radv_compute_sample_locs_pixel(num_samples, sample_locs[i],
875 					       sample_locs_pixel[i]);
876 	}
877 
878 	/* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
879 	centroid_priority =
880 		radv_compute_centroid_priority(cmd_buffer, sample_locs[0],
881 					       num_samples);
882 
883 	/* Compute the maximum sample distance from the specified locations. */
884 	for (unsigned i = 0; i < 4; ++i) {
885 		for (uint32_t j = 0; j < num_samples; j++) {
886 			VkOffset2D offset = sample_locs[i][j];
887 			max_sample_dist = MAX2(max_sample_dist,
888 			                       MAX2(abs(offset.x), abs(offset.y)));
889 		}
890 	}
891 
892 	/* Emit the specified user sample locations. */
893 	switch (num_samples) {
894 	case 2:
895 	case 4:
896 		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
897 		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
898 		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
899 		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
900 		break;
901 	case 8:
902 		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
903 		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
904 		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
905 		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
906 		radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
907 		radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
908 		radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
909 		radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
910 		break;
911 	default:
912 		unreachable("invalid number of samples");
913 	}
914 
915 	/* Emit the maximum sample distance and the centroid priority. */
916 	radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
917 				   S_028BE0_MAX_SAMPLE_DIST(max_sample_dist),
918 				   ~C_028BE0_MAX_SAMPLE_DIST);
919 
920 	radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
921 	radeon_emit(cs, centroid_priority);
922 	radeon_emit(cs, centroid_priority >> 32);
923 
924 	/* GFX9: Flush DFSM when the AA mode changes. */
925 	if (cmd_buffer->device->dfsm_allowed) {
926 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
927 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
928 	}
929 
930 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
931 }
932 
933 static void
radv_emit_inline_push_consts(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline,gl_shader_stage stage,int idx,int count,uint32_t * values)934 radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
935 			     struct radv_pipeline *pipeline,
936 			     gl_shader_stage stage,
937 			     int idx, int count, uint32_t *values)
938 {
939 	struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
940 	uint32_t base_reg = pipeline->user_data_0[stage];
941 	if (loc->sgpr_idx == -1)
942 		return;
943 
944 	assert(loc->num_sgprs == count);
945 
946 	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
947 	radeon_emit_array(cmd_buffer->cs, values, count);
948 }
949 
950 static void
radv_update_multisample_state(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline)951 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
952 			      struct radv_pipeline *pipeline)
953 {
954 	int num_samples = pipeline->graphics.ms.num_samples;
955 	struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
956 
957 	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
958 		cmd_buffer->sample_positions_needed = true;
959 
960 	if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
961 		return;
962 
963 	radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
964 
965 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
966 }
967 
968 static void
radv_update_binning_state(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline)969 radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
970 			  struct radv_pipeline *pipeline)
971 {
972 	const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
973 
974 
975 	if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
976 		return;
977 
978 	if (old_pipeline &&
979 	    old_pipeline->graphics.binning.pa_sc_binner_cntl_0 == pipeline->graphics.binning.pa_sc_binner_cntl_0 &&
980 	    old_pipeline->graphics.binning.db_dfsm_control == pipeline->graphics.binning.db_dfsm_control)
981 		return;
982 
983 	bool binning_flush = false;
984 	if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
985 	    cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
986 	    cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
987 	    cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
988 		binning_flush = !old_pipeline ||
989 			G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
990 			G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
991 	}
992 
993 	radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
994 			       pipeline->graphics.binning.pa_sc_binner_cntl_0 |
995 			       S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
996 
997 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
998 		radeon_set_context_reg(cmd_buffer->cs, R_028038_DB_DFSM_CONTROL,
999 				       pipeline->graphics.binning.db_dfsm_control);
1000 	} else {
1001 		radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
1002 				       pipeline->graphics.binning.db_dfsm_control);
1003 	}
1004 
1005 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
1006 }
1007 
1008 
1009 static void
radv_emit_shader_prefetch(struct radv_cmd_buffer * cmd_buffer,struct radv_shader_variant * shader)1010 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer,
1011 			  struct radv_shader_variant *shader)
1012 {
1013 	uint64_t va;
1014 
1015 	if (!shader)
1016 		return;
1017 
1018 	va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
1019 
1020 	si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1021 }
1022 
1023 static void
radv_emit_prefetch_L2(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline,bool vertex_stage_only)1024 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
1025 		      struct radv_pipeline *pipeline,
1026 		      bool vertex_stage_only)
1027 {
1028 	struct radv_cmd_state *state = &cmd_buffer->state;
1029 	uint32_t mask = state->prefetch_L2_mask;
1030 
1031 	if (vertex_stage_only) {
1032 		/* Fast prefetch path for starting draws as soon as possible.
1033 		 */
1034 		mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS |
1035 						  RADV_PREFETCH_VBO_DESCRIPTORS);
1036 	}
1037 
1038 	if (mask & RADV_PREFETCH_VS)
1039 		radv_emit_shader_prefetch(cmd_buffer,
1040 					  pipeline->shaders[MESA_SHADER_VERTEX]);
1041 
1042 	if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1043 		si_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
1044 
1045 	if (mask & RADV_PREFETCH_TCS)
1046 		radv_emit_shader_prefetch(cmd_buffer,
1047 					  pipeline->shaders[MESA_SHADER_TESS_CTRL]);
1048 
1049 	if (mask & RADV_PREFETCH_TES)
1050 		radv_emit_shader_prefetch(cmd_buffer,
1051 					  pipeline->shaders[MESA_SHADER_TESS_EVAL]);
1052 
1053 	if (mask & RADV_PREFETCH_GS) {
1054 		radv_emit_shader_prefetch(cmd_buffer,
1055 					  pipeline->shaders[MESA_SHADER_GEOMETRY]);
1056 		if (radv_pipeline_has_gs_copy_shader(pipeline))
1057 			radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
1058 	}
1059 
1060 	if (mask & RADV_PREFETCH_PS)
1061 		radv_emit_shader_prefetch(cmd_buffer,
1062 					  pipeline->shaders[MESA_SHADER_FRAGMENT]);
1063 
1064 	state->prefetch_L2_mask &= ~mask;
1065 }
1066 
1067 static void
radv_emit_rbplus_state(struct radv_cmd_buffer * cmd_buffer)1068 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1069 {
1070 	if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
1071 		return;
1072 
1073 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1074 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1075 
1076 	unsigned sx_ps_downconvert = 0;
1077 	unsigned sx_blend_opt_epsilon = 0;
1078 	unsigned sx_blend_opt_control = 0;
1079 
1080 	if (!cmd_buffer->state.attachments || !subpass)
1081 		return;
1082 
1083 	for (unsigned i = 0; i < subpass->color_count; ++i) {
1084 		if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1085 			/* We don't set the DISABLE bits, because the HW can't have holes,
1086 			 * so the SPI color format is set to 32-bit 1-component. */
1087 			sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1088 			continue;
1089 		}
1090 
1091 		int idx = subpass->color_attachments[i].attachment;
1092 		struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1093 
1094 		unsigned format = G_028C70_FORMAT(cb->cb_color_info);
1095 		unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1096 		uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
1097 		uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
1098 
1099 		bool has_alpha, has_rgb;
1100 
1101 		/* Set if RGB and A are present. */
1102 		has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
1103 
1104 		if (format == V_028C70_COLOR_8 ||
1105 		    format == V_028C70_COLOR_16 ||
1106 		    format == V_028C70_COLOR_32)
1107 			has_rgb = !has_alpha;
1108 		else
1109 			has_rgb = true;
1110 
1111 		/* Check the colormask and export format. */
1112 		if (!(colormask & 0x7))
1113 			has_rgb = false;
1114 		if (!(colormask & 0x8))
1115 			has_alpha = false;
1116 
1117 		if (spi_format == V_028714_SPI_SHADER_ZERO) {
1118 			has_rgb = false;
1119 			has_alpha = false;
1120 		}
1121 
1122 		/* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1123 		 * optimization, even though it has no alpha. */
1124 		if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1125 			has_alpha = true;
1126 
1127 		/* Disable value checking for disabled channels. */
1128 		if (!has_rgb)
1129 			sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1130 		if (!has_alpha)
1131 			sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1132 
1133 		/* Enable down-conversion for 32bpp and smaller formats. */
1134 		switch (format) {
1135 		case V_028C70_COLOR_8:
1136 		case V_028C70_COLOR_8_8:
1137 		case V_028C70_COLOR_8_8_8_8:
1138 			/* For 1 and 2-channel formats, use the superset thereof. */
1139 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1140 			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1141 			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1142 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1143 				sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1144 			}
1145 			break;
1146 
1147 		case V_028C70_COLOR_5_6_5:
1148 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1149 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1150 				sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1151 			}
1152 			break;
1153 
1154 		case V_028C70_COLOR_1_5_5_5:
1155 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1156 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1157 				sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1158 			}
1159 			break;
1160 
1161 		case V_028C70_COLOR_4_4_4_4:
1162 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1163 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1164 				sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1165 			}
1166 			break;
1167 
1168 		case V_028C70_COLOR_32:
1169 			if (swap == V_028C70_SWAP_STD &&
1170 			    spi_format == V_028714_SPI_SHADER_32_R)
1171 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1172 			else if (swap == V_028C70_SWAP_ALT_REV &&
1173 				 spi_format == V_028714_SPI_SHADER_32_AR)
1174 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1175 			break;
1176 
1177 		case V_028C70_COLOR_16:
1178 		case V_028C70_COLOR_16_16:
1179 			/* For 1-channel formats, use the superset thereof. */
1180 			if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1181 			    spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1182 			    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1183 			    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1184 				if (swap == V_028C70_SWAP_STD ||
1185 				    swap == V_028C70_SWAP_STD_REV)
1186 					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1187 				else
1188 					sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1189 			}
1190 			break;
1191 
1192 		case V_028C70_COLOR_10_11_11:
1193 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1194 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1195 				sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4);
1196 			}
1197 			break;
1198 
1199 		case V_028C70_COLOR_2_10_10_10:
1200 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1201 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1202 				sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1203 			}
1204 			break;
1205 		case V_028C70_COLOR_5_9_9_9:
1206 			if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1207 				sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1208 			break;
1209 		}
1210 	}
1211 
1212 	/* Do not set the DISABLE bits for the unused attachments, as that
1213 	 * breaks dual source blending in SkQP and does not seem to improve
1214 	 * performance. */
1215 
1216 	if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1217 	    sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1218 	    sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1219 		return;
1220 
1221 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1222 	radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1223 	radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1224 	radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1225 
1226 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
1227 
1228 	cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1229 	cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1230 	cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1231 }
1232 
1233 static void
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer * cmd_buffer)1234 radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1235 {
1236 	if (!cmd_buffer->device->pbb_allowed)
1237 		return;
1238 
1239         struct radv_binning_settings settings =
1240                 radv_get_binning_settings(cmd_buffer->device->physical_device);
1241 	bool break_for_new_ps =
1242 		(!cmd_buffer->state.emitted_pipeline ||
1243 		 cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
1244 		 cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
1245 		(settings.context_states_per_bin > 1 ||
1246 		 settings.persistent_states_per_bin > 1);
1247 	bool break_for_new_cb_target_mask =
1248 		(!cmd_buffer->state.emitted_pipeline ||
1249 		 cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
1250 		 cmd_buffer->state.pipeline->graphics.cb_target_mask) &&
1251 		 settings.context_states_per_bin > 1;
1252 
1253 	if (!break_for_new_ps && !break_for_new_cb_target_mask)
1254 		return;
1255 
1256 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1257 	radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1258 }
1259 
1260 static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer * cmd_buffer)1261 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1262 {
1263 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1264 
1265 	if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1266 		return;
1267 
1268 	radv_update_multisample_state(cmd_buffer, pipeline);
1269 	radv_update_binning_state(cmd_buffer, pipeline);
1270 
1271 	cmd_buffer->scratch_size_per_wave_needed = MAX2(cmd_buffer->scratch_size_per_wave_needed,
1272 	                                                pipeline->scratch_bytes_per_wave);
1273 	cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted,
1274 	                                        pipeline->max_waves);
1275 
1276 	if (!cmd_buffer->state.emitted_pipeline ||
1277 	    cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1278 	     pipeline->graphics.can_use_guardband)
1279 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1280 
1281 	if (!cmd_buffer->state.emitted_pipeline ||
1282 	    cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
1283 	    pipeline->graphics.pa_su_sc_mode_cntl)
1284 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1285 					   RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
1286 
1287 	if (!cmd_buffer->state.emitted_pipeline)
1288 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
1289 
1290 	if (!cmd_buffer->state.emitted_pipeline ||
1291 	    cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
1292 	    pipeline->graphics.db_depth_control)
1293 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
1294 					   RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1295 					   RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
1296 					   RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1297 					   RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
1298 					   RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1299 
1300 	if (!cmd_buffer->state.emitted_pipeline)
1301 		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1302 
1303 	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1304 
1305 	if (!cmd_buffer->state.emitted_pipeline ||
1306 	    cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1307 	    cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1308 	    memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
1309 	           pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
1310 		radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1311 		cmd_buffer->state.context_roll_without_scissor_emitted = true;
1312 	}
1313 
1314 	radv_emit_batch_break_on_new_ps(cmd_buffer);
1315 
1316 	for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1317 		if (!pipeline->shaders[i])
1318 			continue;
1319 
1320 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1321 				   pipeline->shaders[i]->bo);
1322 	}
1323 
1324 	if (radv_pipeline_has_gs_copy_shader(pipeline))
1325 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
1326 				   pipeline->gs_copy_shader->bo);
1327 
1328 	if (unlikely(cmd_buffer->device->trace_bo))
1329 		radv_save_pipeline(cmd_buffer, pipeline);
1330 
1331 	cmd_buffer->state.emitted_pipeline = pipeline;
1332 
1333 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1334 }
1335 
1336 static void
radv_emit_viewport(struct radv_cmd_buffer * cmd_buffer)1337 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1338 {
1339 	si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
1340 			  cmd_buffer->state.dynamic.viewport.viewports);
1341 }
1342 
1343 static void
radv_emit_scissor(struct radv_cmd_buffer * cmd_buffer)1344 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1345 {
1346 	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1347 
1348 	si_write_scissors(cmd_buffer->cs, 0, count,
1349 			  cmd_buffer->state.dynamic.scissor.scissors,
1350 			  cmd_buffer->state.dynamic.viewport.viewports,
1351 			  cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1352 
1353 	cmd_buffer->state.context_roll_without_scissor_emitted = false;
1354 }
1355 
1356 static void
radv_emit_discard_rectangle(struct radv_cmd_buffer * cmd_buffer)1357 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1358 {
1359 	if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1360 		return;
1361 
1362 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1363 	                           cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1364 	for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1365 		VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1366 		radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1367 		radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1368 		                            S_028214_BR_Y(rect.offset.y + rect.extent.height));
1369 	}
1370 }
1371 
1372 static void
radv_emit_line_width(struct radv_cmd_buffer * cmd_buffer)1373 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1374 {
1375 	unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1376 
1377 	radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1378 			       S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1379 }
1380 
1381 static void
radv_emit_blend_constants(struct radv_cmd_buffer * cmd_buffer)1382 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1383 {
1384 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1385 
1386 	radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1387 	radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1388 }
1389 
1390 static void
radv_emit_stencil(struct radv_cmd_buffer * cmd_buffer)1391 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1392 {
1393 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1394 
1395 	radeon_set_context_reg_seq(cmd_buffer->cs,
1396 				   R_028430_DB_STENCILREFMASK, 2);
1397 	radeon_emit(cmd_buffer->cs,
1398 		    S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1399 		    S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1400 		    S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1401 		    S_028430_STENCILOPVAL(1));
1402 	radeon_emit(cmd_buffer->cs,
1403 		    S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1404 		    S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1405 		    S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1406 		    S_028434_STENCILOPVAL_BF(1));
1407 }
1408 
1409 static void
radv_emit_depth_bounds(struct radv_cmd_buffer * cmd_buffer)1410 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1411 {
1412 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1413 
1414 	radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN,
1415 			       fui(d->depth_bounds.min));
1416 	radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX,
1417 			       fui(d->depth_bounds.max));
1418 }
1419 
1420 static void
radv_emit_depth_bias(struct radv_cmd_buffer * cmd_buffer)1421 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1422 {
1423 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1424 	unsigned slope = fui(d->depth_bias.slope * 16.0f);
1425 	unsigned bias = fui(d->depth_bias.bias * cmd_buffer->state.offset_scale);
1426 
1427 
1428 	radeon_set_context_reg_seq(cmd_buffer->cs,
1429 				   R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1430 	radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1431 	radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1432 	radeon_emit(cmd_buffer->cs, bias); /* FRONT OFFSET */
1433 	radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1434 	radeon_emit(cmd_buffer->cs, bias); /* BACK OFFSET */
1435 }
1436 
1437 static void
radv_emit_line_stipple(struct radv_cmd_buffer * cmd_buffer)1438 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1439 {
1440 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1441 	uint32_t auto_reset_cntl = 1;
1442 
1443 	if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1444 		auto_reset_cntl = 2;
1445 
1446 	radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1447 			       S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1448 			       S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1449 			       S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1450 }
1451 
1452 static void
radv_emit_culling(struct radv_cmd_buffer * cmd_buffer,uint32_t states)1453 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint32_t states)
1454 {
1455 	unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
1456 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1457 
1458 	if (states & RADV_CMD_DIRTY_DYNAMIC_CULL_MODE) {
1459 		pa_su_sc_mode_cntl &= C_028814_CULL_FRONT;
1460 		pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT));
1461 
1462 		pa_su_sc_mode_cntl &= C_028814_CULL_BACK;
1463 		pa_su_sc_mode_cntl |= S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT));
1464 	}
1465 
1466 	if (states & RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE) {
1467 		pa_su_sc_mode_cntl &= C_028814_FACE;
1468 		pa_su_sc_mode_cntl |= S_028814_FACE(d->front_face);
1469 	}
1470 
1471 	radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL,
1472 			       pa_su_sc_mode_cntl);
1473 }
1474 
1475 static void
radv_emit_primitive_topology(struct radv_cmd_buffer * cmd_buffer)1476 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1477 {
1478 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1479 
1480 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1481 		radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
1482 					   cmd_buffer->cs,
1483 					   R_030908_VGT_PRIMITIVE_TYPE, 1,
1484 					   d->primitive_topology);
1485 	} else {
1486 		radeon_set_config_reg(cmd_buffer->cs,
1487 				      R_008958_VGT_PRIMITIVE_TYPE,
1488 				      d->primitive_topology);
1489 	}
1490 }
1491 
1492 static void
radv_emit_depth_control(struct radv_cmd_buffer * cmd_buffer,uint32_t states)1493 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint32_t states)
1494 {
1495 	unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
1496 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1497 
1498 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE) {
1499 		db_depth_control &= C_028800_Z_ENABLE;
1500 		db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0);
1501 	}
1502 
1503 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE) {
1504 		db_depth_control &= C_028800_Z_WRITE_ENABLE;
1505 		db_depth_control |= S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0);
1506 	}
1507 
1508 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP) {
1509 		db_depth_control &= C_028800_ZFUNC;
1510 		db_depth_control |= S_028800_ZFUNC(d->depth_compare_op);
1511 	}
1512 
1513 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
1514 		db_depth_control &= C_028800_DEPTH_BOUNDS_ENABLE;
1515 		db_depth_control |= S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0);
1516 	}
1517 
1518 	if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE) {
1519 		db_depth_control &= C_028800_STENCIL_ENABLE;
1520 		db_depth_control |= S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0);
1521 
1522 		db_depth_control &= C_028800_BACKFACE_ENABLE;
1523 		db_depth_control |= S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0);
1524 	}
1525 
1526 	if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP) {
1527 		db_depth_control &= C_028800_STENCILFUNC;
1528 		db_depth_control |= S_028800_STENCILFUNC(d->stencil_op.front.compare_op);
1529 
1530 		db_depth_control &= C_028800_STENCILFUNC_BF;
1531 		db_depth_control |= S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1532 	}
1533 
1534 	radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL,
1535 			       db_depth_control);
1536 }
1537 
1538 static void
radv_emit_stencil_control(struct radv_cmd_buffer * cmd_buffer)1539 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1540 {
1541 	struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1542 
1543 	radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1544 			       S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1545 			       S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1546 			       S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1547 			       S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1548 			       S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1549 			       S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1550 }
1551 
1552 static void
radv_emit_fb_color_state(struct radv_cmd_buffer * cmd_buffer,int index,struct radv_color_buffer_info * cb,struct radv_image_view * iview,VkImageLayout layout,bool in_render_loop)1553 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer,
1554 			 int index,
1555 			 struct radv_color_buffer_info *cb,
1556 			 struct radv_image_view *iview,
1557 			 VkImageLayout layout,
1558 			 bool in_render_loop)
1559 {
1560 	bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1561 	uint32_t cb_color_info = cb->cb_color_info;
1562 	struct radv_image *image = iview->image;
1563 
1564 	if (!radv_layout_dcc_compressed(cmd_buffer->device, image, layout, in_render_loop,
1565 	                                radv_image_queue_family_mask(image,
1566 	                                                             cmd_buffer->queue_family_index,
1567 	                                                             cmd_buffer->queue_family_index))) {
1568 		cb_color_info &= C_028C70_DCC_ENABLE;
1569 	}
1570 
1571 	if (!radv_layout_can_fast_clear(image, layout, in_render_loop,
1572 	                                radv_image_queue_family_mask(image,
1573 	                                                             cmd_buffer->queue_family_index,
1574 	                                                             cmd_buffer->queue_family_index))) {
1575 		cb_color_info &= C_028C70_COMPRESSION;
1576 	}
1577 
1578 	if (radv_image_is_tc_compat_cmask(image) &&
1579 	    (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1580 	     radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1581 		/* If this bit is set, the FMASK decompression operation
1582 		 * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1583 		 */
1584 		cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1585 	}
1586 
1587 	if (radv_image_has_fmask(image) &&
1588 	    (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1589 	     radv_is_hw_resolve_pipeline(cmd_buffer))) {
1590 		/* Make sure FMASK is enabled if it has been cleared because:
1591 		 *
1592 		 * 1) it's required for FMASK_DECOMPRESS operations to avoid
1593 		 * GPU hangs
1594 		 * 2) it's necessary for CB_RESOLVE which can read compressed
1595 		 * FMASK data anyways.
1596 		 */
1597 		cb_color_info |= S_028C70_COMPRESSION(1);
1598 	}
1599 
1600 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1601 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1602 			radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1603 			radeon_emit(cmd_buffer->cs, 0);
1604 			radeon_emit(cmd_buffer->cs, 0);
1605 			radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1606 			radeon_emit(cmd_buffer->cs, cb_color_info);
1607 			radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1608 			radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1609 			radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1610 			radeon_emit(cmd_buffer->cs, 0);
1611 			radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1612 			radeon_emit(cmd_buffer->cs, 0);
1613 
1614 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 1);
1615 			radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1616 
1617 			radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1618 					       cb->cb_color_base >> 32);
1619 			radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1620 					       cb->cb_color_cmask >> 32);
1621 			radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1622 					       cb->cb_color_fmask >> 32);
1623 			radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1624 					       cb->cb_dcc_base >> 32);
1625 			radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1626 					       cb->cb_color_attrib2);
1627 			radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1628 					       cb->cb_color_attrib3);
1629 	} else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1630 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1631 		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1632 		radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1633 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1634 		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1635 		radeon_emit(cmd_buffer->cs, cb_color_info);
1636 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1637 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1638 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1639 		radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1640 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1641 		radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1642 
1643 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1644 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1645 		radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1646 
1647 		radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1648 				       cb->cb_mrt_epitch);
1649 	} else {
1650 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1651 		radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1652 		radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1653 		radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1654 		radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1655 		radeon_emit(cmd_buffer->cs, cb_color_info);
1656 		radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1657 		radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1658 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1659 		radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1660 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1661 		radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1662 
1663 		if (is_vi) { /* DCC BASE */
1664 			radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
1665 		}
1666 	}
1667 
1668 	if (radv_dcc_enabled(image, iview->base_mip)) {
1669 		/* Drawing with DCC enabled also compresses colorbuffers. */
1670 		VkImageSubresourceRange range = {
1671 			.aspectMask = iview->aspect_mask,
1672 			.baseMipLevel = iview->base_mip,
1673 			.levelCount = iview->level_count,
1674 			.baseArrayLayer = iview->base_layer,
1675 			.layerCount = iview->layer_count,
1676 		};
1677 
1678 		radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1679 	}
1680 }
1681 
1682 static void
radv_update_zrange_precision(struct radv_cmd_buffer * cmd_buffer,struct radv_ds_buffer_info * ds,const struct radv_image_view * iview,VkImageLayout layout,bool in_render_loop,bool requires_cond_exec)1683 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer,
1684 			     struct radv_ds_buffer_info *ds,
1685 			     const struct radv_image_view *iview,
1686 			     VkImageLayout layout,
1687 			     bool in_render_loop, bool requires_cond_exec)
1688 {
1689 	const struct radv_image *image = iview->image;
1690 	uint32_t db_z_info = ds->db_z_info;
1691 	uint32_t db_z_info_reg;
1692 
1693 	if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1694 	    !radv_image_is_tc_compat_htile(image))
1695 		return;
1696 
1697 	if (!radv_layout_is_htile_compressed(cmd_buffer->device, image, layout, in_render_loop,
1698 					     radv_image_queue_family_mask(image,
1699 									  cmd_buffer->queue_family_index,
1700 									  cmd_buffer->queue_family_index))) {
1701 		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1702 	}
1703 
1704 	db_z_info &= C_028040_ZRANGE_PRECISION;
1705 
1706 	if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1707 		db_z_info_reg = R_028038_DB_Z_INFO;
1708 	} else {
1709 		db_z_info_reg = R_028040_DB_Z_INFO;
1710 	}
1711 
1712 	/* When we don't know the last fast clear value we need to emit a
1713 	 * conditional packet that will eventually skip the following
1714 	 * SET_CONTEXT_REG packet.
1715 	 */
1716 	if (requires_cond_exec) {
1717 		uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1718 
1719 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1720 		radeon_emit(cmd_buffer->cs, va);
1721 		radeon_emit(cmd_buffer->cs, va >> 32);
1722 		radeon_emit(cmd_buffer->cs, 0);
1723 		radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1724 	}
1725 
1726 	radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1727 }
1728 
1729 static void
radv_emit_fb_ds_state(struct radv_cmd_buffer * cmd_buffer,struct radv_ds_buffer_info * ds,struct radv_image_view * iview,VkImageLayout layout,bool in_render_loop)1730 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer,
1731 		      struct radv_ds_buffer_info *ds,
1732 		      struct radv_image_view *iview,
1733 		      VkImageLayout layout,
1734 		      bool in_render_loop)
1735 {
1736 	const struct radv_image *image = iview->image;
1737 	uint32_t db_z_info = ds->db_z_info;
1738 	uint32_t db_stencil_info = ds->db_stencil_info;
1739 
1740 	if (!radv_layout_is_htile_compressed(cmd_buffer->device, image, layout, in_render_loop,
1741 					     radv_image_queue_family_mask(image,
1742 									  cmd_buffer->queue_family_index,
1743 									  cmd_buffer->queue_family_index))) {
1744 		db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1745 		db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1746 	}
1747 
1748 	radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1749 	radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1750 
1751 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1752 		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1753 		radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1754 
1755 		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1756 		radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1757 		radeon_emit(cmd_buffer->cs, db_z_info);
1758 		radeon_emit(cmd_buffer->cs, db_stencil_info);
1759 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1760 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1761 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1762 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1763 
1764 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1765 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1766 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1767 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1768 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1769 		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1770 	} else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1771 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1772 		radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1773 		radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1774 		radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1775 
1776 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1777 		radeon_emit(cmd_buffer->cs, db_z_info);			/* DB_Z_INFO */
1778 		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* DB_STENCIL_INFO */
1779 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* DB_Z_READ_BASE */
1780 		radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32));	/* DB_Z_READ_BASE_HI */
1781 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* DB_STENCIL_READ_BASE */
1782 		radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1783 		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* DB_Z_WRITE_BASE */
1784 		radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32));	/* DB_Z_WRITE_BASE_HI */
1785 		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* DB_STENCIL_WRITE_BASE */
1786 		radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1787 
1788 		radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1789 		radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1790 		radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1791 	} else {
1792 		radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1793 
1794 		radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1795 		radeon_emit(cmd_buffer->cs, ds->db_depth_info);	/* R_02803C_DB_DEPTH_INFO */
1796 		radeon_emit(cmd_buffer->cs, db_z_info);			/* R_028040_DB_Z_INFO */
1797 		radeon_emit(cmd_buffer->cs, db_stencil_info);	        /* R_028044_DB_STENCIL_INFO */
1798 		radeon_emit(cmd_buffer->cs, ds->db_z_read_base);	/* R_028048_DB_Z_READ_BASE */
1799 		radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);	/* R_02804C_DB_STENCIL_READ_BASE */
1800 		radeon_emit(cmd_buffer->cs, ds->db_z_write_base);	/* R_028050_DB_Z_WRITE_BASE */
1801 		radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);	/* R_028054_DB_STENCIL_WRITE_BASE */
1802 		radeon_emit(cmd_buffer->cs, ds->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
1803 		radeon_emit(cmd_buffer->cs, ds->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */
1804 
1805 	}
1806 
1807 	/* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1808 	radv_update_zrange_precision(cmd_buffer, ds, iview, layout,
1809 				     in_render_loop, true);
1810 
1811 	radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1812 			       ds->pa_su_poly_offset_db_fmt_cntl);
1813 }
1814 
1815 /**
1816  * Update the fast clear depth/stencil values if the image is bound as a
1817  * depth/stencil buffer.
1818  */
1819 static void
radv_update_bound_fast_clear_ds(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)1820 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1821 				const struct radv_image_view *iview,
1822 				VkClearDepthStencilValue ds_clear_value,
1823 				VkImageAspectFlags aspects)
1824 {
1825 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1826 	const struct radv_image *image = iview->image;
1827 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1828 	uint32_t att_idx;
1829 
1830 	if (!cmd_buffer->state.attachments || !subpass)
1831 		return;
1832 
1833 	if (!subpass->depth_stencil_attachment)
1834 		return;
1835 
1836 	att_idx = subpass->depth_stencil_attachment->attachment;
1837 	if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1838 		return;
1839 
1840 	if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
1841 			VK_IMAGE_ASPECT_STENCIL_BIT)) {
1842 		radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1843 		radeon_emit(cs, ds_clear_value.stencil);
1844 		radeon_emit(cs, fui(ds_clear_value.depth));
1845 	} else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1846 		radeon_set_context_reg_seq(cs, R_02802C_DB_DEPTH_CLEAR, 1);
1847 		radeon_emit(cs, fui(ds_clear_value.depth));
1848 	} else {
1849 		assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1850 		radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 1);
1851 		radeon_emit(cs, ds_clear_value.stencil);
1852 	}
1853 
1854 	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
1855 	 * only needed when clearing Z to 0.0.
1856 	 */
1857 	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1858 	    ds_clear_value.depth == 0.0) {
1859 		VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1860 		bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
1861 
1862 		radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds,
1863 					     iview, layout, in_render_loop, false);
1864 	}
1865 
1866 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
1867 }
1868 
1869 /**
1870  * Set the clear depth/stencil values to the image's metadata.
1871  */
1872 static void
radv_set_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)1873 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1874 			   struct radv_image *image,
1875 			   const VkImageSubresourceRange *range,
1876 			   VkClearDepthStencilValue ds_clear_value,
1877 			   VkImageAspectFlags aspects)
1878 {
1879 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1880 	uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
1881 	uint32_t level_count = radv_get_levelCount(image, range);
1882 
1883 	if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT |
1884 		        VK_IMAGE_ASPECT_STENCIL_BIT)) {
1885 		/* Use the fastest way when both aspects are used. */
1886 		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
1887 		radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1888 				S_370_WR_CONFIRM(1) |
1889 				S_370_ENGINE_SEL(V_370_PFP));
1890 		radeon_emit(cs, va);
1891 		radeon_emit(cs, va >> 32);
1892 
1893 		for (uint32_t l = 0; l < level_count; l++) {
1894 			radeon_emit(cs, ds_clear_value.stencil);
1895 			radeon_emit(cs, fui(ds_clear_value.depth));
1896 		}
1897 	} else {
1898 		/* Otherwise we need one WRITE_DATA packet per level. */
1899 		for (uint32_t l = 0; l < level_count; l++) {
1900 			uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
1901 			unsigned value;
1902 
1903 			if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1904 				value = fui(ds_clear_value.depth);
1905 				va += 4;
1906 			} else {
1907 				assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1908 				value = ds_clear_value.stencil;
1909 			}
1910 
1911 			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
1912 			radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1913 					S_370_WR_CONFIRM(1) |
1914 					S_370_ENGINE_SEL(V_370_PFP));
1915 			radeon_emit(cs, va);
1916 			radeon_emit(cs, va >> 32);
1917 			radeon_emit(cs, value);
1918 		}
1919 	}
1920 }
1921 
1922 /**
1923  * Update the TC-compat metadata value for this image.
1924  */
1925 static void
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)1926 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1927 				   struct radv_image *image,
1928 				   const VkImageSubresourceRange *range,
1929 				   uint32_t value)
1930 {
1931 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
1932 
1933 	if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
1934 		return;
1935 
1936 	uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
1937 	uint32_t level_count = radv_get_levelCount(image, range);
1938 
1939 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
1940 	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
1941 			S_370_WR_CONFIRM(1) |
1942 			S_370_ENGINE_SEL(V_370_PFP));
1943 	radeon_emit(cs, va);
1944 	radeon_emit(cs, va >> 32);
1945 
1946 	for (uint32_t l = 0; l < level_count; l++)
1947 		radeon_emit(cs, value);
1948 }
1949 
1950 static void
radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value)1951 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
1952 				      const struct radv_image_view *iview,
1953 				      VkClearDepthStencilValue ds_clear_value)
1954 {
1955 	VkImageSubresourceRange range = {
1956 		.aspectMask = iview->aspect_mask,
1957 		.baseMipLevel = iview->base_mip,
1958 		.levelCount = iview->level_count,
1959 		.baseArrayLayer = iview->base_layer,
1960 		.layerCount = iview->layer_count,
1961 	};
1962 	uint32_t cond_val;
1963 
1964 	/* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
1965 	 * depth clear value is 0.0f.
1966 	 */
1967 	cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
1968 
1969 	radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range,
1970 					   cond_val);
1971 }
1972 
1973 /**
1974  * Update the clear depth/stencil values for this image.
1975  */
1976 void
radv_update_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)1977 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
1978 			      const struct radv_image_view *iview,
1979 			      VkClearDepthStencilValue ds_clear_value,
1980 			      VkImageAspectFlags aspects)
1981 {
1982 	VkImageSubresourceRange range = {
1983 		.aspectMask = iview->aspect_mask,
1984 		.baseMipLevel = iview->base_mip,
1985 		.levelCount = iview->level_count,
1986 		.baseArrayLayer = iview->base_layer,
1987 		.layerCount = iview->layer_count,
1988 	};
1989 	struct radv_image *image = iview->image;
1990 
1991 	assert(radv_image_has_htile(image));
1992 
1993 	radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range,
1994 				   ds_clear_value, aspects);
1995 
1996 	if (radv_image_is_tc_compat_htile(image) &&
1997 	    (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
1998 		radv_update_tc_compat_zrange_metadata(cmd_buffer, iview,
1999 						      ds_clear_value);
2000 	}
2001 
2002 	radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value,
2003 					aspects);
2004 }
2005 
2006 /**
2007  * Load the clear depth/stencil values from the image's metadata.
2008  */
2009 static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview)2010 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2011 			    const struct radv_image_view *iview)
2012 {
2013 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
2014 	const struct radv_image *image = iview->image;
2015 	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
2016 	uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
2017 	unsigned reg_offset = 0, reg_count = 0;
2018 
2019 	if (!radv_image_has_htile(image))
2020 		return;
2021 
2022 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2023 		++reg_count;
2024 	} else {
2025 		++reg_offset;
2026 		va += 4;
2027 	}
2028 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2029 		++reg_count;
2030 
2031 	uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2032 
2033 	if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2034 		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2035 		radeon_emit(cs, va);
2036 		radeon_emit(cs, va >> 32);
2037 		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2038 		radeon_emit(cs, reg_count);
2039 	} else {
2040 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2041 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
2042 				COPY_DATA_DST_SEL(COPY_DATA_REG) |
2043 				(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2044 		radeon_emit(cs, va);
2045 		radeon_emit(cs, va >> 32);
2046 		radeon_emit(cs, reg >> 2);
2047 		radeon_emit(cs, 0);
2048 
2049 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2050 		radeon_emit(cs, 0);
2051 	}
2052 }
2053 
2054 /*
2055  * With DCC some colors don't require CMASK elimination before being
2056  * used as a texture. This sets a predicate value to determine if the
2057  * cmask eliminate is required.
2058  */
2059 void
radv_update_fce_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,bool value)2060 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer,
2061 			 struct radv_image *image,
2062 			 const VkImageSubresourceRange *range, bool value)
2063 {
2064 	uint64_t pred_val = value;
2065 	uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2066 	uint32_t level_count = radv_get_levelCount(image, range);
2067 	uint32_t count = 2 * level_count;
2068 
2069 	assert(radv_dcc_enabled(image, range->baseMipLevel));
2070 
2071 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2072 	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
2073 				    S_370_WR_CONFIRM(1) |
2074 				    S_370_ENGINE_SEL(V_370_PFP));
2075 	radeon_emit(cmd_buffer->cs, va);
2076 	radeon_emit(cmd_buffer->cs, va >> 32);
2077 
2078 	for (uint32_t l = 0; l < level_count; l++) {
2079 		radeon_emit(cmd_buffer->cs, pred_val);
2080 		radeon_emit(cmd_buffer->cs, pred_val >> 32);
2081 	}
2082 }
2083 
2084 /**
2085  * Update the DCC predicate to reflect the compression state.
2086  */
2087 void
radv_update_dcc_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,bool value)2088 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer,
2089 			 struct radv_image *image,
2090 			 const VkImageSubresourceRange *range, bool value)
2091 {
2092 	uint64_t pred_val = value;
2093 	uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2094 	uint32_t level_count = radv_get_levelCount(image, range);
2095 	uint32_t count = 2 * level_count;
2096 
2097 	assert(radv_dcc_enabled(image, range->baseMipLevel));
2098 
2099 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2100 	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM) |
2101 				    S_370_WR_CONFIRM(1) |
2102 				    S_370_ENGINE_SEL(V_370_PFP));
2103 	radeon_emit(cmd_buffer->cs, va);
2104 	radeon_emit(cmd_buffer->cs, va >> 32);
2105 
2106 	for (uint32_t l = 0; l < level_count; l++) {
2107 		radeon_emit(cmd_buffer->cs, pred_val);
2108 		radeon_emit(cmd_buffer->cs, pred_val >> 32);
2109 	}
2110 }
2111 
2112 /**
2113  * Update the fast clear color values if the image is bound as a color buffer.
2114  */
2115 static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,int cb_idx,uint32_t color_values[2])2116 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
2117 				   struct radv_image *image,
2118 				   int cb_idx,
2119 				   uint32_t color_values[2])
2120 {
2121 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2122 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
2123 	uint32_t att_idx;
2124 
2125 	if (!cmd_buffer->state.attachments || !subpass)
2126 		return;
2127 
2128 	att_idx = subpass->color_attachments[cb_idx].attachment;
2129 	if (att_idx == VK_ATTACHMENT_UNUSED)
2130 		return;
2131 
2132 	if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2133 		return;
2134 
2135 	radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2136 	radeon_emit(cs, color_values[0]);
2137 	radeon_emit(cs, color_values[1]);
2138 
2139 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
2140 }
2141 
2142 /**
2143  * Set the clear color values to the image's metadata.
2144  */
2145 static void
radv_set_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t color_values[2])2146 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2147 			      struct radv_image *image,
2148 			      const VkImageSubresourceRange *range,
2149 			      uint32_t color_values[2])
2150 {
2151 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
2152 	uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2153 	uint32_t level_count = radv_get_levelCount(image, range);
2154 	uint32_t count = 2 * level_count;
2155 
2156 	assert(radv_image_has_cmask(image) ||
2157 	       radv_dcc_enabled(image, range->baseMipLevel));
2158 
2159 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2160 	radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
2161 			S_370_WR_CONFIRM(1) |
2162 			S_370_ENGINE_SEL(V_370_PFP));
2163 	radeon_emit(cs, va);
2164 	radeon_emit(cs, va >> 32);
2165 
2166 	for (uint32_t l = 0; l < level_count; l++) {
2167 		radeon_emit(cs, color_values[0]);
2168 		radeon_emit(cs, color_values[1]);
2169 	}
2170 }
2171 
2172 /**
2173  * Update the clear color values for this image.
2174  */
2175 void
radv_update_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,int cb_idx,uint32_t color_values[2])2176 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2177 				 const struct radv_image_view *iview,
2178 				 int cb_idx,
2179 				 uint32_t color_values[2])
2180 {
2181 	struct radv_image *image = iview->image;
2182 	VkImageSubresourceRange range = {
2183 		.aspectMask = iview->aspect_mask,
2184 		.baseMipLevel = iview->base_mip,
2185 		.levelCount = iview->level_count,
2186 		.baseArrayLayer = iview->base_layer,
2187 		.layerCount = iview->layer_count,
2188 	};
2189 
2190 	assert(radv_image_has_cmask(image) ||
2191 	       radv_dcc_enabled(image, iview->base_mip));
2192 
2193 	radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2194 
2195 	radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
2196 					   color_values);
2197 }
2198 
2199 /**
2200  * Load the clear color values from the image's metadata.
2201  */
2202 static void
radv_load_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image_view * iview,int cb_idx)2203 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2204 			       struct radv_image_view *iview,
2205 			       int cb_idx)
2206 {
2207 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
2208 	struct radv_image *image = iview->image;
2209 	uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
2210 
2211 	if (!radv_image_has_cmask(image) &&
2212 	    !radv_dcc_enabled(image, iview->base_mip))
2213 		return;
2214 
2215 	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2216 
2217 	if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2218 		radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2219 		radeon_emit(cs, va);
2220 		radeon_emit(cs, va >> 32);
2221 		radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2222 		radeon_emit(cs, 2);
2223 	} else {
2224 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2225 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
2226 				COPY_DATA_DST_SEL(COPY_DATA_REG) |
2227 				COPY_DATA_COUNT_SEL);
2228 		radeon_emit(cs, va);
2229 		radeon_emit(cs, va >> 32);
2230 		radeon_emit(cs, reg >> 2);
2231 		radeon_emit(cs, 0);
2232 
2233 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2234 		radeon_emit(cs, 0);
2235 	}
2236 }
2237 
2238 /* GFX9+ metadata cache flushing workaround. metadata cache coherency is
2239  * broken if the CB caches data of multiple mips of the same image at the
2240  * same time.
2241  *
2242  * Insert some flushes to avoid this.
2243  */
2244 static void
radv_emit_fb_mip_change_flush(struct radv_cmd_buffer * cmd_buffer)2245 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2246 {
2247 	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2248 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2249 	bool color_mip_changed = false;
2250 
2251 	/* Entire workaround is not applicable before GFX9 */
2252 	if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2253 		return;
2254 
2255 	if (!framebuffer)
2256 		return;
2257 
2258 	for (int i = 0; i < subpass->color_count; ++i) {
2259 		int idx = subpass->color_attachments[i].attachment;
2260 		if (idx == VK_ATTACHMENT_UNUSED)
2261 			continue;
2262 
2263 		struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2264 
2265 		if ((radv_image_has_CB_metadata(iview->image) ||
2266 		     radv_image_has_dcc(iview->image)) &&
2267 		    cmd_buffer->state.cb_mip[i] != iview->base_mip)
2268 			color_mip_changed = true;
2269 
2270 		cmd_buffer->state.cb_mip[i] = iview->base_mip;
2271 	}
2272 
2273 	if (color_mip_changed) {
2274 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2275 		                                RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2276 	}
2277 }
2278 
2279 /* This function does the flushes for mip changes if the levels are not zero for
2280  * all render targets. This way we can assume at the start of the next cmd_buffer
2281  * that rendering to mip 0 doesn't need any flushes. As that is the most common
2282  * case that saves some flushes. */
2283 static void
radv_emit_mip_change_flush_default(struct radv_cmd_buffer * cmd_buffer)2284 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2285 {
2286 	/* Entire workaround is not applicable before GFX9 */
2287 	if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2288 		return;
2289 
2290 	bool need_color_mip_flush = false;
2291 	for (unsigned i = 0; i < 8; ++i) {
2292 		if (cmd_buffer->state.cb_mip[i]) {
2293 			need_color_mip_flush = true;
2294 			break;
2295 		}
2296 	}
2297 
2298 	if (need_color_mip_flush) {
2299 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2300 		                                RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2301 	}
2302 
2303 	memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2304 }
2305 
2306 static void
radv_emit_framebuffer_state(struct radv_cmd_buffer * cmd_buffer)2307 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2308 {
2309 	int i;
2310 	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2311 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2312 
2313 	/* this may happen for inherited secondary recording */
2314 	if (!framebuffer)
2315 		return;
2316 
2317 	for (i = 0; i < 8; ++i) {
2318 		if (i >= subpass->color_count || subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2319 			radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
2320 				       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
2321 			continue;
2322 		}
2323 
2324 		int idx = subpass->color_attachments[i].attachment;
2325 		struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2326 		VkImageLayout layout = subpass->color_attachments[i].layout;
2327 		bool in_render_loop = subpass->color_attachments[i].in_render_loop;
2328 
2329 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->bo);
2330 
2331 		assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2332 		                                       VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2333 		radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout, in_render_loop);
2334 
2335 		radv_load_color_clear_metadata(cmd_buffer, iview, i);
2336 	}
2337 
2338 	if (subpass->depth_stencil_attachment) {
2339 		int idx = subpass->depth_stencil_attachment->attachment;
2340 		VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2341 		bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2342 		struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2343 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.attachments[idx].iview->bo);
2344 
2345 		radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout, in_render_loop);
2346 
2347 		if (cmd_buffer->state.attachments[idx].ds.offset_scale != cmd_buffer->state.offset_scale) {
2348 			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
2349 			cmd_buffer->state.offset_scale = cmd_buffer->state.attachments[idx].ds.offset_scale;
2350 		}
2351 		radv_load_ds_clear_metadata(cmd_buffer, iview);
2352 	} else {
2353 		if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
2354 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2355 		else
2356 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2357 
2358 		radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
2359 		radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2360 	}
2361 	radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2362 			       S_028208_BR_X(framebuffer->width) |
2363 			       S_028208_BR_Y(framebuffer->height));
2364 
2365 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2366 		bool disable_constant_encode =
2367 			cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2368 		enum chip_class chip_class =
2369 			cmd_buffer->device->physical_device->rad_info.chip_class;
2370 		uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2371 
2372 		radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2373 				       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2374 				       S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2375 				       S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2376 	}
2377 
2378 	if (cmd_buffer->device->dfsm_allowed) {
2379 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2380 		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2381 	}
2382 
2383 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2384 }
2385 
2386 static void
radv_emit_index_buffer(struct radv_cmd_buffer * cmd_buffer,bool indirect)2387 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2388 {
2389 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
2390 	struct radv_cmd_state *state = &cmd_buffer->state;
2391 
2392 	if (state->index_type != state->last_index_type) {
2393 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2394 			radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
2395 						   cs, R_03090C_VGT_INDEX_TYPE,
2396 						   2, state->index_type);
2397 		} else {
2398 			radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2399 			radeon_emit(cs, state->index_type);
2400 		}
2401 
2402 		state->last_index_type = state->index_type;
2403 	}
2404 
2405 	/* For the direct indexed draws we use DRAW_INDEX_2, which includes
2406 	 * the index_va and max_index_count already. */
2407 	if (!indirect)
2408 		return;
2409 
2410 	radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2411 	radeon_emit(cs, state->index_va);
2412 	radeon_emit(cs, state->index_va >> 32);
2413 
2414 	radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2415 	radeon_emit(cs, state->max_index_count);
2416 
2417 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2418 }
2419 
radv_set_db_count_control(struct radv_cmd_buffer * cmd_buffer)2420 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2421 {
2422 	bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2423 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2424 	uint32_t pa_sc_mode_cntl_1 =
2425 		pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2426 	uint32_t db_count_control;
2427 
2428 	if(!cmd_buffer->state.active_occlusion_queries) {
2429 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2430 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2431 			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2432 			    has_perfect_queries) {
2433 				/* Re-enable out-of-order rasterization if the
2434 				 * bound pipeline supports it and if it's has
2435 				 * been disabled before starting any perfect
2436 				 * occlusion queries.
2437 				 */
2438 				radeon_set_context_reg(cmd_buffer->cs,
2439 						       R_028A4C_PA_SC_MODE_CNTL_1,
2440 						       pa_sc_mode_cntl_1);
2441 			}
2442 		}
2443 		db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2444 	} else {
2445 		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2446 		uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2447 		bool gfx10_perfect = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2448 
2449 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2450 			/* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2451 			 * covered tiles, discards, and early depth testing. For more details,
2452 			 * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2453 			db_count_control =
2454 				S_028004_PERFECT_ZPASS_COUNTS(1) |
2455 				S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2456 				S_028004_SAMPLE_RATE(sample_rate) |
2457 				S_028004_ZPASS_ENABLE(1) |
2458 				S_028004_SLICE_EVEN_ENABLE(1) |
2459 				S_028004_SLICE_ODD_ENABLE(1);
2460 
2461 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2462 			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
2463 			    has_perfect_queries) {
2464 				/* If the bound pipeline has enabled
2465 				 * out-of-order rasterization, we should
2466 				 * disable it before starting any perfect
2467 				 * occlusion queries.
2468 				 */
2469 				pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2470 
2471 				radeon_set_context_reg(cmd_buffer->cs,
2472 						       R_028A4C_PA_SC_MODE_CNTL_1,
2473 						       pa_sc_mode_cntl_1);
2474 			}
2475 		} else {
2476 			db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2477 				S_028004_SAMPLE_RATE(sample_rate);
2478 		}
2479 	}
2480 
2481 	radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2482 
2483 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
2484 }
2485 
2486 static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer * cmd_buffer)2487 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
2488 {
2489 	uint32_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2490 
2491 	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2492 		radv_emit_viewport(cmd_buffer);
2493 
2494 	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
2495 	    !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
2496 		radv_emit_scissor(cmd_buffer);
2497 
2498 	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
2499 		radv_emit_line_width(cmd_buffer);
2500 
2501 	if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
2502 		radv_emit_blend_constants(cmd_buffer);
2503 
2504 	if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
2505 				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2506 				       RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
2507 		radv_emit_stencil(cmd_buffer);
2508 
2509 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
2510 		radv_emit_depth_bounds(cmd_buffer);
2511 
2512 	if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
2513 		radv_emit_depth_bias(cmd_buffer);
2514 
2515 	if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
2516 		radv_emit_discard_rectangle(cmd_buffer);
2517 
2518 	if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
2519 		radv_emit_sample_locations(cmd_buffer);
2520 
2521 	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
2522 		radv_emit_line_stipple(cmd_buffer);
2523 
2524 	if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
2525 		      RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE))
2526 		radv_emit_culling(cmd_buffer, states);
2527 
2528 	if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
2529 		radv_emit_primitive_topology(cmd_buffer);
2530 
2531 	if (states & (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
2532 		      RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
2533 		      RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
2534 		      RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
2535 		      RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
2536 		      RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
2537 		radv_emit_depth_control(cmd_buffer, states);
2538 
2539 	if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
2540 		radv_emit_stencil_control(cmd_buffer);
2541 
2542 	cmd_buffer->state.dirty &= ~states;
2543 }
2544 
2545 static void
radv_flush_push_descriptors(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)2546 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
2547 			    VkPipelineBindPoint bind_point)
2548 {
2549 	struct radv_descriptor_state *descriptors_state =
2550 		radv_get_descriptors_state(cmd_buffer, bind_point);
2551 	struct radv_descriptor_set *set = &descriptors_state->push_set.set;
2552 	unsigned bo_offset;
2553 
2554 	if (!radv_cmd_buffer_upload_data(cmd_buffer, set->size, 32,
2555 					 set->mapped_ptr,
2556 					 &bo_offset))
2557 		return;
2558 
2559 	set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2560 	set->va += bo_offset;
2561 }
2562 
2563 static void
radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)2564 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
2565 				    VkPipelineBindPoint bind_point)
2566 {
2567 	struct radv_descriptor_state *descriptors_state =
2568 		radv_get_descriptors_state(cmd_buffer, bind_point);
2569 	uint32_t size = MAX_SETS * 4;
2570 	uint32_t offset;
2571 	void *ptr;
2572 
2573 	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size,
2574 					  256, &offset, &ptr))
2575 		return;
2576 
2577 	for (unsigned i = 0; i < MAX_SETS; i++) {
2578 		uint32_t *uptr = ((uint32_t *)ptr) + i;
2579 		uint64_t set_va = 0;
2580 		struct radv_descriptor_set *set = descriptors_state->sets[i];
2581 		if (descriptors_state->valid & (1u << i))
2582 			set_va = set->va;
2583 		uptr[0] = set_va & 0xffffffff;
2584 	}
2585 
2586 	uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2587 	va += offset;
2588 
2589 	if (cmd_buffer->state.pipeline) {
2590 		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_VERTEX])
2591 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2592 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2593 
2594 		if (cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT])
2595 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_FRAGMENT,
2596 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2597 
2598 		if (radv_pipeline_has_gs(cmd_buffer->state.pipeline))
2599 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
2600 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2601 
2602 		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2603 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_CTRL,
2604 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2605 
2606 		if (radv_pipeline_has_tess(cmd_buffer->state.pipeline))
2607 			radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_TESS_EVAL,
2608 						   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2609 	}
2610 
2611 	if (cmd_buffer->state.compute_pipeline)
2612 		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.compute_pipeline, MESA_SHADER_COMPUTE,
2613 					   AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2614 }
2615 
2616 static void
radv_flush_descriptors(struct radv_cmd_buffer * cmd_buffer,VkShaderStageFlags stages)2617 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
2618 		       VkShaderStageFlags stages)
2619 {
2620 	VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2621 					 VK_PIPELINE_BIND_POINT_COMPUTE :
2622 					 VK_PIPELINE_BIND_POINT_GRAPHICS;
2623 	struct radv_descriptor_state *descriptors_state =
2624 		radv_get_descriptors_state(cmd_buffer, bind_point);
2625 	struct radv_cmd_state *state = &cmd_buffer->state;
2626 	bool flush_indirect_descriptors;
2627 
2628 	if (!descriptors_state->dirty)
2629 		return;
2630 
2631 	if (descriptors_state->push_dirty)
2632 		radv_flush_push_descriptors(cmd_buffer, bind_point);
2633 
2634 	flush_indirect_descriptors =
2635 		(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
2636 		 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
2637 		(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
2638 		 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);
2639 
2640 	if (flush_indirect_descriptors)
2641 		radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
2642 
2643 	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
2644 	                                                   cmd_buffer->cs,
2645 	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);
2646 
2647 	if (cmd_buffer->state.pipeline) {
2648 		radv_foreach_stage(stage, stages) {
2649 			if (!cmd_buffer->state.pipeline->shaders[stage])
2650 				continue;
2651 
2652 			radv_emit_descriptor_pointers(cmd_buffer,
2653 						      cmd_buffer->state.pipeline,
2654 						      descriptors_state, stage);
2655 		}
2656 	}
2657 
2658 	if (cmd_buffer->state.compute_pipeline &&
2659 	    (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
2660 		radv_emit_descriptor_pointers(cmd_buffer,
2661 					      cmd_buffer->state.compute_pipeline,
2662 					      descriptors_state,
2663 					      MESA_SHADER_COMPUTE);
2664 	}
2665 
2666 	descriptors_state->dirty = 0;
2667 	descriptors_state->push_dirty = false;
2668 
2669 	assert(cmd_buffer->cs->cdw <= cdw_max);
2670 
2671 	if (unlikely(cmd_buffer->device->trace_bo))
2672 		radv_save_descriptors(cmd_buffer, bind_point);
2673 }
2674 
2675 static void
radv_flush_constants(struct radv_cmd_buffer * cmd_buffer,VkShaderStageFlags stages)2676 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
2677 		     VkShaderStageFlags stages)
2678 {
2679 	struct radv_pipeline *pipeline = stages & VK_SHADER_STAGE_COMPUTE_BIT
2680 					 ? cmd_buffer->state.compute_pipeline
2681 					 : cmd_buffer->state.pipeline;
2682 	VkPipelineBindPoint bind_point = stages & VK_SHADER_STAGE_COMPUTE_BIT ?
2683 					 VK_PIPELINE_BIND_POINT_COMPUTE :
2684 					 VK_PIPELINE_BIND_POINT_GRAPHICS;
2685 	struct radv_descriptor_state *descriptors_state =
2686 		radv_get_descriptors_state(cmd_buffer, bind_point);
2687 	struct radv_pipeline_layout *layout = pipeline->layout;
2688 	struct radv_shader_variant *shader, *prev_shader;
2689 	bool need_push_constants = false;
2690 	unsigned offset;
2691 	void *ptr;
2692 	uint64_t va;
2693 
2694 	stages &= cmd_buffer->push_constant_stages;
2695 	if (!stages ||
2696 	    (!layout->push_constant_size && !layout->dynamic_offset_count))
2697 		return;
2698 
2699 	radv_foreach_stage(stage, stages) {
2700 		shader = radv_get_shader(pipeline, stage);
2701 		if (!shader)
2702 			continue;
2703 
2704 		need_push_constants |= shader->info.loads_push_constants;
2705 		need_push_constants |= shader->info.loads_dynamic_offsets;
2706 
2707 		uint8_t base = shader->info.base_inline_push_consts;
2708 		uint8_t count = shader->info.num_inline_push_consts;
2709 
2710 		radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
2711 					     AC_UD_INLINE_PUSH_CONSTANTS,
2712 					     count,
2713 					     (uint32_t *)&cmd_buffer->push_constants[base * 4]);
2714 	}
2715 
2716 	if (need_push_constants) {
2717 		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
2718 						  16 * layout->dynamic_offset_count,
2719 						  256, &offset, &ptr))
2720 			return;
2721 
2722 		memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
2723 		memcpy((char*)ptr + layout->push_constant_size,
2724 		       descriptors_state->dynamic_buffers,
2725 		       16 * layout->dynamic_offset_count);
2726 
2727 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2728 		va += offset;
2729 
2730 		ASSERTED unsigned cdw_max =
2731 			radeon_check_space(cmd_buffer->device->ws,
2732 	                                   cmd_buffer->cs, MESA_SHADER_STAGES * 4);
2733 
2734 		prev_shader = NULL;
2735 		radv_foreach_stage(stage, stages) {
2736 			shader = radv_get_shader(pipeline, stage);
2737 
2738 			/* Avoid redundantly emitting the address for merged stages. */
2739 			if (shader && shader != prev_shader) {
2740 				radv_emit_userdata_address(cmd_buffer, pipeline, stage,
2741 							   AC_UD_PUSH_CONSTANTS, va);
2742 
2743 				prev_shader = shader;
2744 			}
2745 		}
2746 		assert(cmd_buffer->cs->cdw <= cdw_max);
2747 	}
2748 
2749 	cmd_buffer->push_constant_stages &= ~stages;
2750 }
2751 
2752 static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer * cmd_buffer,bool pipeline_is_dirty)2753 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
2754 			      bool pipeline_is_dirty)
2755 {
2756 	if ((pipeline_is_dirty ||
2757 	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
2758 	    cmd_buffer->state.pipeline->num_vertex_bindings &&
2759 	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.has_vertex_buffers) {
2760 		unsigned vb_offset;
2761 		void *vb_ptr;
2762 		uint32_t i = 0;
2763 		uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
2764 		uint64_t va;
2765 
2766 		/* allocate some descriptor state for vertex buffers */
2767 		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
2768 						  &vb_offset, &vb_ptr))
2769 			return;
2770 
2771 		for (i = 0; i < count; i++) {
2772 			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
2773 			uint32_t offset;
2774 			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
2775 			unsigned num_records;
2776 			unsigned stride;
2777 
2778 			if (!buffer) {
2779 				memset(desc, 0, 4 * 4);
2780 				continue;
2781 			}
2782 
2783 			va = radv_buffer_get_va(buffer->bo);
2784 
2785 			offset = cmd_buffer->vertex_bindings[i].offset;
2786 			va += offset + buffer->offset;
2787 
2788 			if (cmd_buffer->vertex_bindings[i].size) {
2789 				num_records = cmd_buffer->vertex_bindings[i].size;
2790 			} else {
2791 				num_records = buffer->size - offset;
2792 			}
2793 
2794 			if (cmd_buffer->state.pipeline->graphics.uses_dynamic_stride) {
2795 				stride = cmd_buffer->vertex_bindings[i].stride;
2796 			} else {
2797 				stride = cmd_buffer->state.pipeline->binding_stride[i];
2798 			}
2799 
2800 			if (cmd_buffer->device->physical_device->rad_info.chip_class != GFX8 && stride)
2801 				num_records /= stride;
2802 
2803 			uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2804 					      S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2805 					      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2806 					      S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2807 
2808 			if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2809 				/* OOB_SELECT chooses the out-of-bounds check:
2810 				 * - 1: index >= NUM_RECORDS (Structured)
2811 				 * - 3: offset >= NUM_RECORDS (Raw)
2812 				 */
2813                                int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
2814 
2815                                rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_UINT) |
2816 					     S_008F0C_OOB_SELECT(oob_select) |
2817 					     S_008F0C_RESOURCE_LEVEL(1);
2818                        } else {
2819                                rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
2820 					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2821                        }
2822 
2823 			desc[0] = va;
2824 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
2825 			desc[2] = num_records;
2826 			desc[3] = rsrc_word3;
2827 		}
2828 
2829 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2830 		va += vb_offset;
2831 
2832 		radv_emit_userdata_address(cmd_buffer, cmd_buffer->state.pipeline, MESA_SHADER_VERTEX,
2833 					   AC_UD_VS_VERTEX_BUFFERS, va);
2834 
2835 		cmd_buffer->state.vb_va = va;
2836 		cmd_buffer->state.vb_size = count * 16;
2837 		cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
2838 	}
2839 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
2840 }
2841 
2842 static void
radv_emit_streamout_buffers(struct radv_cmd_buffer * cmd_buffer,uint64_t va)2843 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2844 {
2845 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2846 	struct radv_userdata_info *loc;
2847 	uint32_t base_reg;
2848 
2849 	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2850 		if (!radv_get_shader(pipeline, stage))
2851 			continue;
2852 
2853 		loc = radv_lookup_user_sgpr(pipeline, stage,
2854 					    AC_UD_STREAMOUT_BUFFERS);
2855 		if (loc->sgpr_idx == -1)
2856 			continue;
2857 
2858 		base_reg = pipeline->user_data_0[stage];
2859 
2860 		radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2861 					 base_reg + loc->sgpr_idx * 4, va, false);
2862 	}
2863 
2864 	if (radv_pipeline_has_gs_copy_shader(pipeline)) {
2865 		loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
2866 		if (loc->sgpr_idx != -1) {
2867 			base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
2868 
2869 			radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
2870 						 base_reg + loc->sgpr_idx * 4, va, false);
2871 		}
2872 	}
2873 }
2874 
2875 static void
radv_flush_streamout_descriptors(struct radv_cmd_buffer * cmd_buffer)2876 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
2877 {
2878 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
2879 		struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
2880 		struct radv_streamout_state *so = &cmd_buffer->state.streamout;
2881 		unsigned so_offset;
2882 		void *so_ptr;
2883 		uint64_t va;
2884 
2885 		/* Allocate some descriptor state for streamout buffers. */
2886 		if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
2887 						  MAX_SO_BUFFERS * 16, 256,
2888 						  &so_offset, &so_ptr))
2889 			return;
2890 
2891 		for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
2892 			struct radv_buffer *buffer = sb[i].buffer;
2893 			uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
2894 
2895 			if (!(so->enabled_mask & (1 << i)))
2896 				continue;
2897 
2898 			va = radv_buffer_get_va(buffer->bo) + buffer->offset;
2899 
2900 			va += sb[i].offset;
2901 
2902 			/* Set the descriptor.
2903 			 *
2904 			 * On GFX8, the format must be non-INVALID, otherwise
2905 			 * the buffer will be considered not bound and store
2906 			 * instructions will be no-ops.
2907 			 */
2908 			uint32_t size = 0xffffffff;
2909 
2910 			/* Compute the correct buffer size for NGG streamout
2911 			 * because it's used to determine the max emit per
2912 			 * buffer.
2913 			 */
2914 			if (cmd_buffer->device->physical_device->use_ngg_streamout)
2915 				size = buffer->size - sb[i].offset;
2916 
2917 			uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2918 					      S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2919 					      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2920 					      S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2921 
2922 			if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
2923 				rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
2924 					      S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
2925 					      S_008F0C_RESOURCE_LEVEL(1);
2926 			} else {
2927 				rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2928 			}
2929 
2930 			desc[0] = va;
2931 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
2932 			desc[2] = size;
2933 			desc[3] = rsrc_word3;
2934 		}
2935 
2936 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2937 		va += so_offset;
2938 
2939 		radv_emit_streamout_buffers(cmd_buffer, va);
2940 	}
2941 
2942 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
2943 }
2944 
2945 static void
radv_flush_ngg_gs_state(struct radv_cmd_buffer * cmd_buffer)2946 radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
2947 {
2948 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2949 	struct radv_userdata_info *loc;
2950 	uint32_t ngg_gs_state = 0;
2951 	uint32_t base_reg;
2952 
2953 	if (!radv_pipeline_has_gs(pipeline) ||
2954 	    !radv_pipeline_has_ngg(pipeline))
2955 		return;
2956 
2957 	/* By default NGG GS queries are disabled but they are enabled if the
2958 	 * command buffer has active GDS queries or if it's a secondary command
2959 	 * buffer that inherits the number of generated primitives.
2960 	 */
2961 	if (cmd_buffer->state.active_pipeline_gds_queries ||
2962 	    (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
2963 		ngg_gs_state = 1;
2964 
2965 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
2966 				    AC_UD_NGG_GS_STATE);
2967 	base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
2968 	assert(loc->sgpr_idx != -1);
2969 
2970 	radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2971 			  ngg_gs_state);
2972 }
2973 
2974 static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer * cmd_buffer,bool pipeline_is_dirty)2975 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2976 {
2977 	radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
2978 	radv_flush_streamout_descriptors(cmd_buffer);
2979 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2980 	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
2981 	radv_flush_ngg_gs_state(cmd_buffer);
2982 }
2983 
/**
 * Parameters describing a single draw command.
 */
struct radv_draw_info {
	/** Number of vertices. */
	uint32_t count;

	/** Index of the first vertex. */
	int32_t vertex_offset;

	/** First instance id. */
	uint32_t first_instance;

	/** Number of instances. */
	uint32_t instance_count;

	/** First index (indexed draws only). */
	uint32_t first_index;

	/** Whether it's an indexed draw. */
	bool indexed;

	/** Indirect draw parameters resource, its offset and stride. */
	struct radv_buffer *indirect;
	uint64_t indirect_offset;
	uint32_t stride;

	/** Draw count parameters resource and its offset. */
	struct radv_buffer *count_buffer;
	uint64_t count_buffer_offset;

	/** Stream output parameters resource and its offset. */
	struct radv_buffer *strmout_buffer;
	uint64_t strmout_buffer_offset;
};
3034 
3035 static uint32_t
radv_get_primitive_reset_index(struct radv_cmd_buffer * cmd_buffer)3036 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
3037 {
3038 	switch (cmd_buffer->state.index_type) {
3039 	case V_028A7C_VGT_INDEX_8:
3040 		return 0xffu;
3041 	case V_028A7C_VGT_INDEX_16:
3042 		return 0xffffu;
3043 	case V_028A7C_VGT_INDEX_32:
3044 		return 0xffffffffu;
3045 	default:
3046 		unreachable("invalid index type");
3047 	}
3048 }
3049 
3050 static void
si_emit_ia_multi_vgt_param(struct radv_cmd_buffer * cmd_buffer,bool instanced_draw,bool indirect_draw,bool count_from_stream_output,uint32_t draw_vertex_count)3051 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
3052 			   bool instanced_draw, bool indirect_draw,
3053 			   bool count_from_stream_output,
3054 			   uint32_t draw_vertex_count)
3055 {
3056 	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3057 	struct radv_cmd_state *state = &cmd_buffer->state;
3058 	unsigned topology = state->dynamic.primitive_topology;
3059 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
3060 	unsigned ia_multi_vgt_param;
3061 
3062 	ia_multi_vgt_param =
3063 		si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
3064 					  indirect_draw,
3065 					  count_from_stream_output,
3066 					  draw_vertex_count,
3067 					  topology);
3068 
3069 	if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
3070 		if (info->chip_class == GFX9) {
3071 			radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device,
3072 						   cs,
3073 						   R_030960_IA_MULTI_VGT_PARAM,
3074 						   4, ia_multi_vgt_param);
3075 		} else if (info->chip_class >= GFX7) {
3076 			radeon_set_context_reg_idx(cs,
3077 						   R_028AA8_IA_MULTI_VGT_PARAM,
3078 						   1, ia_multi_vgt_param);
3079 		} else {
3080 			radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM,
3081 					       ia_multi_vgt_param);
3082 		}
3083 		state->last_ia_multi_vgt_param = ia_multi_vgt_param;
3084 	}
3085 }
3086 
3087 static void
radv_emit_draw_registers(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * draw_info)3088 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
3089 			 const struct radv_draw_info *draw_info)
3090 {
3091 	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3092 	struct radv_cmd_state *state = &cmd_buffer->state;
3093 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
3094 	int32_t primitive_reset_en;
3095 
3096 	/* Draw state. */
3097 	if (info->chip_class < GFX10) {
3098 		si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1,
3099 					   draw_info->indirect,
3100 					   !!draw_info->strmout_buffer,
3101 					   draw_info->indirect ? 0 : draw_info->count);
3102 	}
3103 
3104 	/* Primitive restart. */
3105 	primitive_reset_en =
3106 		draw_info->indexed && state->pipeline->graphics.prim_restart_enable;
3107 
3108 	if (primitive_reset_en != state->last_primitive_reset_en) {
3109 		state->last_primitive_reset_en = primitive_reset_en;
3110 		if (info->chip_class >= GFX9) {
3111 			radeon_set_uconfig_reg(cs,
3112 					       R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
3113 					       primitive_reset_en);
3114 		} else {
3115 			radeon_set_context_reg(cs,
3116 					       R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
3117 					       primitive_reset_en);
3118 		}
3119 	}
3120 
3121 	if (primitive_reset_en) {
3122 		uint32_t primitive_reset_index =
3123 			radv_get_primitive_reset_index(cmd_buffer);
3124 
3125 		if (primitive_reset_index != state->last_primitive_reset_index) {
3126 			radeon_set_context_reg(cs,
3127 					       R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
3128 					       primitive_reset_index);
3129 			state->last_primitive_reset_index = primitive_reset_index;
3130 		}
3131 	}
3132 
3133 	if (draw_info->strmout_buffer) {
3134 		uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
3135 
3136 		va += draw_info->strmout_buffer->offset +
3137 		      draw_info->strmout_buffer_offset;
3138 
3139 		radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
3140 				       draw_info->stride);
3141 
3142 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3143 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
3144 				COPY_DATA_DST_SEL(COPY_DATA_REG) |
3145 				COPY_DATA_WR_CONFIRM);
3146 		radeon_emit(cs, va);
3147 		radeon_emit(cs, va >> 32);
3148 		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
3149 		radeon_emit(cs, 0); /* unused */
3150 
3151 		radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
3152 	}
3153 }
3154 
/*
 * Translate the source stage mask of a barrier into the partial flushes that
 * have to be recorded in the command buffer's pending flush bits.
 *
 * - Compute, transfer, bottom-of-pipe and all-commands request a CS partial
 *   flush.
 * - Any fragment-or-later graphics stage (plus transfer, bottom-of-pipe,
 *   all-graphics and all-commands) requests a PS partial flush.
 * - A VS partial flush is only requested when no PS partial flush was and at
 *   least one pre-rasterization stage is part of the mask.
 */
static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
			     VkPipelineStageFlags src_stage_mask)
{
	const VkPipelineStageFlags cs_stages =
		VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT |
		VK_PIPELINE_STAGE_TRANSFER_BIT |
		VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
		VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
	const VkPipelineStageFlags ps_stages =
		VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
		VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
		VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
		VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
		VK_PIPELINE_STAGE_TRANSFER_BIT |
		VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
		VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT |
		VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
	const VkPipelineStageFlags vs_stages =
		VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
		VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
		VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
		VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
		VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
		VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
		VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT;

	if (src_stage_mask & cs_stages)
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

	/* The VS flush is only considered when no PS flush is requested. */
	if (src_stage_mask & ps_stages)
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
	else if (src_stage_mask & vs_stages)
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
}
3184 
3185 static enum radv_cmd_flush_bits
radv_src_access_flush(struct radv_cmd_buffer * cmd_buffer,VkAccessFlags src_flags,struct radv_image * image)3186 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
3187 		      VkAccessFlags src_flags,
3188 		      struct radv_image *image)
3189 {
3190 	bool flush_CB_meta = true, flush_DB_meta = true;
3191 	enum radv_cmd_flush_bits flush_bits = 0;
3192 	uint32_t b;
3193 
3194 	if (image) {
3195 		if (!radv_image_has_CB_metadata(image))
3196 			flush_CB_meta = false;
3197 		if (!radv_image_has_htile(image))
3198 			flush_DB_meta = false;
3199 	}
3200 
3201 	for_each_bit(b, src_flags) {
3202 		switch ((VkAccessFlagBits)(1 << b)) {
3203 		case VK_ACCESS_SHADER_WRITE_BIT:
3204 		case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3205 		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3206 			flush_bits |= RADV_CMD_FLAG_WB_L2;
3207 			break;
3208 		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3209 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3210 			if (flush_CB_meta)
3211 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3212 			break;
3213 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3214 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3215 			if (flush_DB_meta)
3216 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3217 			break;
3218 		case VK_ACCESS_TRANSFER_WRITE_BIT:
3219 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
3220 			              RADV_CMD_FLAG_FLUSH_AND_INV_DB |
3221 			              RADV_CMD_FLAG_INV_L2;
3222 
3223 			if (flush_CB_meta)
3224 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3225 			if (flush_DB_meta)
3226 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3227 			break;
3228 		case VK_ACCESS_MEMORY_WRITE_BIT:
3229 			flush_bits |= RADV_CMD_FLAG_INV_L2 |
3230 				      RADV_CMD_FLAG_WB_L2 |
3231 				      RADV_CMD_FLAG_FLUSH_AND_INV_CB |
3232 				      RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3233 
3234 			if (flush_CB_meta)
3235 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3236 			if (flush_DB_meta)
3237 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3238 			break;
3239 		default:
3240 			break;
3241 		}
3242 	}
3243 	return flush_bits;
3244 }
3245 
3246 static enum radv_cmd_flush_bits
radv_dst_access_flush(struct radv_cmd_buffer * cmd_buffer,VkAccessFlags dst_flags,struct radv_image * image)3247 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer,
3248                       VkAccessFlags dst_flags,
3249                       struct radv_image *image)
3250 {
3251 	bool flush_CB_meta = true, flush_DB_meta = true;
3252 	enum radv_cmd_flush_bits flush_bits = 0;
3253 	bool flush_CB = true, flush_DB = true;
3254 	bool image_is_coherent = false;
3255 	uint32_t b;
3256 
3257 	if (image) {
3258 		if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3259 			flush_CB = false;
3260 			flush_DB = false;
3261 		}
3262 
3263 		if (!radv_image_has_CB_metadata(image))
3264 			flush_CB_meta = false;
3265 		if (!radv_image_has_htile(image))
3266 			flush_DB_meta = false;
3267 
3268 		/* TODO: implement shader coherent for GFX10 */
3269 
3270 		if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
3271 			if (image->info.samples == 1 &&
3272 			    (image->usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
3273 					     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
3274 			    !vk_format_is_stencil(image->vk_format)) {
3275 				/* Single-sample color and single-sample depth
3276 				 * (not stencil) are coherent with shaders on
3277 				 * GFX9.
3278 				 */
3279 				image_is_coherent = true;
3280 			}
3281 		}
3282 	}
3283 
3284 	for_each_bit(b, dst_flags) {
3285 		switch ((VkAccessFlagBits)(1 << b)) {
3286 		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
3287 		case VK_ACCESS_INDEX_READ_BIT:
3288 		case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3289 			break;
3290 		case VK_ACCESS_UNIFORM_READ_BIT:
3291 			flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3292 			break;
3293 		case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
3294 		case VK_ACCESS_TRANSFER_READ_BIT:
3295 		case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
3296 			flush_bits |= RADV_CMD_FLAG_INV_VCACHE |
3297 			              RADV_CMD_FLAG_INV_L2;
3298 			break;
3299 		case VK_ACCESS_SHADER_READ_BIT:
3300 			flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3301 			/* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
3302 			 * invalidate the scalar cache. */
3303 			if (!cmd_buffer->device->physical_device->use_llvm)
3304 				flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
3305 
3306 			if (!image_is_coherent)
3307 				flush_bits |= RADV_CMD_FLAG_INV_L2;
3308 			break;
3309 		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
3310 			if (flush_CB)
3311 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3312 			if (flush_CB_meta)
3313 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3314 			break;
3315 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
3316 			if (flush_DB)
3317 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3318 			if (flush_DB_meta)
3319 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3320 			break;
3321 		case VK_ACCESS_MEMORY_READ_BIT:
3322 			flush_bits |= RADV_CMD_FLAG_INV_VCACHE |
3323 				      RADV_CMD_FLAG_INV_SCACHE |
3324 			              RADV_CMD_FLAG_INV_L2;
3325 			if (flush_CB)
3326 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3327 			if (flush_CB_meta)
3328 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3329 			if (flush_DB)
3330 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3331 			if (flush_DB_meta)
3332 				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3333 			break;
3334 		default:
3335 			break;
3336 		}
3337 	}
3338 	return flush_bits;
3339 }
3340 
radv_subpass_barrier(struct radv_cmd_buffer * cmd_buffer,const struct radv_subpass_barrier * barrier)3341 void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
3342 			  const struct radv_subpass_barrier *barrier)
3343 {
3344 	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
3345 							      NULL);
3346 	radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
3347 	cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
3348 	                                                      NULL);
3349 }
3350 
3351 uint32_t
radv_get_subpass_id(struct radv_cmd_buffer * cmd_buffer)3352 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
3353 {
3354 	struct radv_cmd_state *state = &cmd_buffer->state;
3355 	uint32_t subpass_id = state->subpass - state->pass->subpasses;
3356 
3357 	/* The id of this subpass shouldn't exceed the number of subpasses in
3358 	 * this render pass minus 1.
3359 	 */
3360 	assert(subpass_id < state->pass->subpass_count);
3361 	return subpass_id;
3362 }
3363 
3364 static struct radv_sample_locations_state *
radv_get_attachment_sample_locations(struct radv_cmd_buffer * cmd_buffer,uint32_t att_idx,bool begin_subpass)3365 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer,
3366 				     uint32_t att_idx,
3367 				     bool begin_subpass)
3368 {
3369 	struct radv_cmd_state *state = &cmd_buffer->state;
3370 	uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
3371 	struct radv_image_view *view = state->attachments[att_idx].iview;
3372 
3373 	if (view->image->info.samples == 1)
3374 		return NULL;
3375 
3376 	if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
3377 		/* Return the initial sample locations if this is the initial
3378 		 * layout transition of the given subpass attachemnt.
3379 		 */
3380 		if (state->attachments[att_idx].sample_location.count > 0)
3381 			return &state->attachments[att_idx].sample_location;
3382 	} else {
3383 		/* Otherwise return the subpass sample locations if defined. */
3384 		if (state->subpass_sample_locs) {
3385 			/* Because the driver sets the current subpass before
3386 			 * initial layout transitions, we should use the sample
3387 			 * locations from the previous subpass to avoid an
3388 			 * off-by-one problem. Otherwise, use the sample
3389 			 * locations for the current subpass for final layout
3390 			 * transitions.
3391 			 */
3392 			if (begin_subpass)
3393 				subpass_id--;
3394 
3395 			for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
3396 				if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
3397 					return &state->subpass_sample_locs[i].sample_location;
3398 			}
3399 		}
3400 	}
3401 
3402 	return NULL;
3403 }
3404 
radv_handle_subpass_image_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_subpass_attachment att,bool begin_subpass)3405 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
3406 						 struct radv_subpass_attachment att,
3407 						 bool begin_subpass)
3408 {
3409 	unsigned idx = att.attachment;
3410 	struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
3411 	struct radv_sample_locations_state *sample_locs;
3412 	VkImageSubresourceRange range;
3413 	range.aspectMask = view->aspect_mask;
3414 	range.baseMipLevel = view->base_mip;
3415 	range.levelCount = 1;
3416 	range.baseArrayLayer = view->base_layer;
3417 	range.layerCount = cmd_buffer->state.framebuffer->layers;
3418 
3419 	if (cmd_buffer->state.subpass->view_mask) {
3420 		/* If the current subpass uses multiview, the driver might have
3421 		 * performed a fast color/depth clear to the whole image
3422 		 * (including all layers). To make sure the driver will
3423 		 * decompress the image correctly (if needed), we have to
3424 		 * account for the "real" number of layers. If the view mask is
3425 		 * sparse, this will decompress more layers than needed.
3426 		 */
3427 		range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
3428 	}
3429 
3430 	/* Get the subpass sample locations for the given attachment, if NULL
3431 	 * is returned the driver will use the default HW locations.
3432 	 */
3433 	sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx,
3434 							   begin_subpass);
3435 
3436 	/* Determine if the subpass uses separate depth/stencil layouts. */
3437 	bool uses_separate_depth_stencil_layouts = false;
3438 	if ((cmd_buffer->state.attachments[idx].current_layout !=
3439 	     cmd_buffer->state.attachments[idx].current_stencil_layout) ||
3440 	    (att.layout != att.stencil_layout)) {
3441 		uses_separate_depth_stencil_layouts = true;
3442 	}
3443 
3444 	/* For separate layouts, perform depth and stencil transitions
3445 	 * separately.
3446 	 */
3447 	if (uses_separate_depth_stencil_layouts &&
3448 	    (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT |
3449 				  VK_IMAGE_ASPECT_STENCIL_BIT))) {
3450 		/* Depth-only transitions. */
3451 		range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
3452 		radv_handle_image_transition(cmd_buffer,
3453 					     view->image,
3454 					     cmd_buffer->state.attachments[idx].current_layout,
3455 					     cmd_buffer->state.attachments[idx].current_in_render_loop,
3456 					     att.layout, att.in_render_loop,
3457 					     0, 0, &range, sample_locs);
3458 
3459 		/* Stencil-only transitions. */
3460 		range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
3461 		radv_handle_image_transition(cmd_buffer,
3462 					     view->image,
3463 					     cmd_buffer->state.attachments[idx].current_stencil_layout,
3464 					     cmd_buffer->state.attachments[idx].current_in_render_loop,
3465 					     att.stencil_layout, att.in_render_loop,
3466 					     0, 0, &range, sample_locs);
3467 	} else {
3468 		radv_handle_image_transition(cmd_buffer,
3469 					     view->image,
3470 					     cmd_buffer->state.attachments[idx].current_layout,
3471 					     cmd_buffer->state.attachments[idx].current_in_render_loop,
3472 					     att.layout, att.in_render_loop,
3473 					     0, 0, &range, sample_locs);
3474 	}
3475 
3476 	cmd_buffer->state.attachments[idx].current_layout = att.layout;
3477 	cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
3478 	cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
3479 
3480 
3481 }
3482 
3483 void
radv_cmd_buffer_set_subpass(struct radv_cmd_buffer * cmd_buffer,const struct radv_subpass * subpass)3484 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
3485 			    const struct radv_subpass *subpass)
3486 {
3487 	cmd_buffer->state.subpass = subpass;
3488 
3489 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
3490 }
3491 
3492 static VkResult
radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer * cmd_buffer,struct radv_render_pass * pass,const VkRenderPassBeginInfo * info)3493 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
3494 				      struct radv_render_pass *pass,
3495 				      const VkRenderPassBeginInfo *info)
3496 {
3497 	const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
3498 		vk_find_struct_const(info->pNext,
3499 				     RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
3500 	struct radv_cmd_state *state = &cmd_buffer->state;
3501 
3502 	if (!sample_locs) {
3503 		state->subpass_sample_locs = NULL;
3504 		return VK_SUCCESS;
3505 	}
3506 
3507 	for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
3508 		const VkAttachmentSampleLocationsEXT *att_sample_locs =
3509 			&sample_locs->pAttachmentInitialSampleLocations[i];
3510 		uint32_t att_idx = att_sample_locs->attachmentIndex;
3511 		struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
3512 
3513 		assert(vk_format_is_depth_or_stencil(image->vk_format));
3514 
3515 		/* From the Vulkan spec 1.1.108:
3516 		 *
3517 		 * "If the image referenced by the framebuffer attachment at
3518 		 *  index attachmentIndex was not created with
3519 		 *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
3520 		 *  then the values specified in sampleLocationsInfo are
3521 		 *  ignored."
3522 		 */
3523 		if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
3524 			continue;
3525 
3526 		const VkSampleLocationsInfoEXT *sample_locs_info =
3527 			&att_sample_locs->sampleLocationsInfo;
3528 
3529 		state->attachments[att_idx].sample_location.per_pixel =
3530 			sample_locs_info->sampleLocationsPerPixel;
3531 		state->attachments[att_idx].sample_location.grid_size =
3532 			sample_locs_info->sampleLocationGridSize;
3533 		state->attachments[att_idx].sample_location.count =
3534 			sample_locs_info->sampleLocationsCount;
3535 		typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
3536 			     sample_locs_info->pSampleLocations,
3537 			     sample_locs_info->sampleLocationsCount);
3538 	}
3539 
3540 	state->subpass_sample_locs = vk_alloc(&cmd_buffer->pool->alloc,
3541 					      sample_locs->postSubpassSampleLocationsCount *
3542 					      sizeof(state->subpass_sample_locs[0]),
3543 					      8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3544 	if (state->subpass_sample_locs == NULL) {
3545 		cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3546 		return cmd_buffer->record_result;
3547 	}
3548 
3549 	state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
3550 
3551 	for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
3552 		const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
3553 			&sample_locs->pPostSubpassSampleLocations[i];
3554 		const VkSampleLocationsInfoEXT *sample_locs_info =
3555 			&subpass_sample_locs_info->sampleLocationsInfo;
3556 
3557 		state->subpass_sample_locs[i].subpass_idx =
3558 			subpass_sample_locs_info->subpassIndex;
3559 		state->subpass_sample_locs[i].sample_location.per_pixel =
3560 			sample_locs_info->sampleLocationsPerPixel;
3561 		state->subpass_sample_locs[i].sample_location.grid_size =
3562 			sample_locs_info->sampleLocationGridSize;
3563 		state->subpass_sample_locs[i].sample_location.count =
3564 			sample_locs_info->sampleLocationsCount;
3565 		typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
3566 			     sample_locs_info->pSampleLocations,
3567 			     sample_locs_info->sampleLocationsCount);
3568 	}
3569 
3570 	return VK_SUCCESS;
3571 }
3572 
3573 static VkResult
radv_cmd_state_setup_attachments(struct radv_cmd_buffer * cmd_buffer,struct radv_render_pass * pass,const VkRenderPassBeginInfo * info)3574 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer,
3575 				 struct radv_render_pass *pass,
3576 				 const VkRenderPassBeginInfo *info)
3577 {
3578 	struct radv_cmd_state *state = &cmd_buffer->state;
3579 	const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
3580 
3581 	if (info) {
3582 		attachment_info = vk_find_struct_const(info->pNext,
3583 		                                       RENDER_PASS_ATTACHMENT_BEGIN_INFO);
3584 	}
3585 
3586 
3587 	if (pass->attachment_count == 0) {
3588 		state->attachments = NULL;
3589 		return VK_SUCCESS;
3590 	}
3591 
3592 	state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
3593 					pass->attachment_count *
3594 					sizeof(state->attachments[0]),
3595 					8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3596 	if (state->attachments == NULL) {
3597 		cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3598 		return cmd_buffer->record_result;
3599 	}
3600 
3601 	for (uint32_t i = 0; i < pass->attachment_count; ++i) {
3602 		struct radv_render_pass_attachment *att = &pass->attachments[i];
3603 		VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
3604 		VkImageAspectFlags clear_aspects = 0;
3605 
3606 		if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3607 			/* color attachment */
3608 			if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3609 				clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
3610 			}
3611 		} else {
3612 			/* depthstencil attachment */
3613 			if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
3614 			    att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3615 				clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
3616 				if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3617 				    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
3618 					clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3619 			}
3620 			if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3621 			    att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3622 				clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3623 			}
3624 		}
3625 
3626 		state->attachments[i].pending_clear_aspects = clear_aspects;
3627 		state->attachments[i].cleared_views = 0;
3628 		if (clear_aspects && info) {
3629 			assert(info->clearValueCount > i);
3630 			state->attachments[i].clear_value = info->pClearValues[i];
3631 		}
3632 
3633 		state->attachments[i].current_layout = att->initial_layout;
3634 		state->attachments[i].current_in_render_loop = false;
3635 		state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
3636 		state->attachments[i].sample_location.count = 0;
3637 
3638 		struct radv_image_view *iview;
3639 		if (attachment_info && attachment_info->attachmentCount > i) {
3640 			iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
3641 		} else {
3642 			iview = state->framebuffer->attachments[i];
3643 		}
3644 
3645 		state->attachments[i].iview = iview;
3646 		if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3647 			radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
3648 		} else {
3649 			radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
3650 		}
3651 	}
3652 
3653 	return VK_SUCCESS;
3654 }
3655 
radv_AllocateCommandBuffers(VkDevice _device,const VkCommandBufferAllocateInfo * pAllocateInfo,VkCommandBuffer * pCommandBuffers)3656 VkResult radv_AllocateCommandBuffers(
3657 	VkDevice _device,
3658 	const VkCommandBufferAllocateInfo *pAllocateInfo,
3659 	VkCommandBuffer *pCommandBuffers)
3660 {
3661 	RADV_FROM_HANDLE(radv_device, device, _device);
3662 	RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
3663 
3664 	VkResult result = VK_SUCCESS;
3665 	uint32_t i;
3666 
3667 	for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
3668 
3669 		if (!list_is_empty(&pool->free_cmd_buffers)) {
3670 			struct radv_cmd_buffer *cmd_buffer = list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
3671 
3672 			list_del(&cmd_buffer->pool_link);
3673 			list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
3674 
3675 			result = radv_reset_cmd_buffer(cmd_buffer);
3676 			cmd_buffer->level = pAllocateInfo->level;
3677 
3678 			pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
3679 		} else {
3680 			result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level,
3681 			                                &pCommandBuffers[i]);
3682 		}
3683 		if (result != VK_SUCCESS)
3684 			break;
3685 	}
3686 
3687 	if (result != VK_SUCCESS) {
3688 		radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
3689 					i, pCommandBuffers);
3690 
3691 		/* From the Vulkan 1.0.66 spec:
3692 		 *
3693 		 * "vkAllocateCommandBuffers can be used to create multiple
3694 		 *  command buffers. If the creation of any of those command
3695 		 *  buffers fails, the implementation must destroy all
3696 		 *  successfully created command buffer objects from this
3697 		 *  command, set all entries of the pCommandBuffers array to
3698 		 *  NULL and return the error."
3699 		 */
3700 		memset(pCommandBuffers, 0,
3701 		       sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
3702 	}
3703 
3704 	return result;
3705 }
3706 
/*
 * Release the given command buffers. NULL handles are skipped. A command
 * buffer that still has a pool is moved to that pool's free list; one
 * without a pool is destroyed.
 *
 * The device and commandPool parameters are not used.
 */
void radv_FreeCommandBuffers(
	VkDevice device,
	VkCommandPool commandPool,
	uint32_t commandBufferCount,
	const VkCommandBuffer *pCommandBuffers)
{
	for (uint32_t idx = 0; idx < commandBufferCount; idx++) {
		RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[idx]);

		if (!cmd_buffer)
			continue;

		if (!cmd_buffer->pool) {
			radv_destroy_cmd_buffer(cmd_buffer);
			continue;
		}

		list_del(&cmd_buffer->pool_link);
		list_addtail(&cmd_buffer->pool_link,
			     &cmd_buffer->pool->free_cmd_buffers);
	}
}
3726 
radv_ResetCommandBuffer(VkCommandBuffer commandBuffer,VkCommandBufferResetFlags flags)3727 VkResult radv_ResetCommandBuffer(
3728 	VkCommandBuffer commandBuffer,
3729 	VkCommandBufferResetFlags flags)
3730 {
3731 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3732 	return radv_reset_cmd_buffer(cmd_buffer);
3733 }
3734 
radv_BeginCommandBuffer(VkCommandBuffer commandBuffer,const VkCommandBufferBeginInfo * pBeginInfo)3735 VkResult radv_BeginCommandBuffer(
3736 	VkCommandBuffer commandBuffer,
3737 	const VkCommandBufferBeginInfo *pBeginInfo)
3738 {
3739 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3740 	VkResult result = VK_SUCCESS;
3741 
3742 	if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
3743 		/* If the command buffer has already been resetted with
3744 		 * vkResetCommandBuffer, no need to do it again.
3745 		 */
3746 		result = radv_reset_cmd_buffer(cmd_buffer);
3747 		if (result != VK_SUCCESS)
3748 			return result;
3749 	}
3750 
3751 	memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
3752 	cmd_buffer->state.last_primitive_reset_en = -1;
3753 	cmd_buffer->state.last_index_type = -1;
3754 	cmd_buffer->state.last_num_instances = -1;
3755 	cmd_buffer->state.last_vertex_offset = -1;
3756 	cmd_buffer->state.last_first_instance = -1;
3757 	cmd_buffer->state.predication_type = -1;
3758 	cmd_buffer->state.last_sx_ps_downconvert = -1;
3759 	cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
3760 	cmd_buffer->state.last_sx_blend_opt_control = -1;
3761 	cmd_buffer->usage_flags = pBeginInfo->flags;
3762 
3763 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
3764 	    (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
3765 		assert(pBeginInfo->pInheritanceInfo);
3766 		cmd_buffer->state.framebuffer = radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
3767 		cmd_buffer->state.pass = radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
3768 
3769 		struct radv_subpass *subpass =
3770 			&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
3771 
3772 		if (cmd_buffer->state.framebuffer) {
3773 			result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
3774 			if (result != VK_SUCCESS)
3775 				return result;
3776 		}
3777 
3778 		cmd_buffer->state.inherited_pipeline_statistics =
3779 			pBeginInfo->pInheritanceInfo->pipelineStatistics;
3780 
3781 		radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3782 	}
3783 
3784 	if (unlikely(cmd_buffer->device->trace_bo))
3785 		radv_cmd_buffer_trace_emit(cmd_buffer);
3786 
3787 	radv_describe_begin_cmd_buffer(cmd_buffer);
3788 
3789 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
3790 
3791 	return result;
3792 }
3793 
/*
 * Bind vertex buffers without explicit sizes or strides: forwards to
 * radv_CmdBindVertexBuffers2EXT() with NULL pSizes and pStrides.
 */
void radv_CmdBindVertexBuffers(
	VkCommandBuffer commandBuffer,
	uint32_t firstBinding,
	uint32_t bindingCount,
	const VkBuffer *pBuffers,
	const VkDeviceSize *pOffsets)
{
	const VkDeviceSize *no_sizes = NULL;
	const VkDeviceSize *no_strides = NULL;

	radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
				      pBuffers, pOffsets, no_sizes, no_strides);
}
3805 
/*
 * Record the vertex buffer bindings [firstBinding, firstBinding +
 * bindingCount) in the command buffer and add their BOs to the command
 * stream's buffer list.
 *
 * pSizes and pStrides are optional; a missing array is treated as all
 * zeroes. The vertex buffer state is only marked dirty if at least one
 * binding actually changed.
 */
void radv_CmdBindVertexBuffers2EXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstBinding,
	uint32_t                                    bindingCount,
	const VkBuffer*                             pBuffers,
	const VkDeviceSize*                         pOffsets,
	const VkDeviceSize*                         pSizes,
	const VkDeviceSize*                         pStrides)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_vertex_binding *bindings = cmd_buffer->vertex_bindings;
	bool any_change = false;

	/* The vertex buffer setup itself is deferred since the buffer
	 * stride from the pipeline is needed for it.
	 */

	assert(firstBinding + bindingCount <= MAX_VBS);
	for (uint32_t i = 0; i < bindingCount; i++) {
		RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
		struct radv_vertex_binding *slot = &bindings[firstBinding + i];
		const VkDeviceSize new_size = pSizes ? pSizes[i] : 0;
		const VkDeviceSize new_stride = pStrides ? pStrides[i] : 0;

		any_change = any_change ||
			     slot->buffer != buffer ||
			     slot->offset != pOffsets[i] ||
			     slot->size != new_size ||
			     slot->stride != new_stride;

		slot->buffer = buffer;
		slot->offset = pOffsets[i];
		slot->size = new_size;
		slot->stride = new_stride;

		if (buffer) {
			radv_cs_add_buffer(cmd_buffer->device->ws,
					   cmd_buffer->cs, slot->buffer->bo);
		}
	}

	/* Only flag the state when something was modified. */
	if (any_change)
		cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
}
3856 
3857 static uint32_t
vk_to_index_type(VkIndexType type)3858 vk_to_index_type(VkIndexType type)
3859 {
3860 	switch (type) {
3861 	case VK_INDEX_TYPE_UINT8_EXT:
3862 		return V_028A7C_VGT_INDEX_8;
3863 	case VK_INDEX_TYPE_UINT16:
3864 		return V_028A7C_VGT_INDEX_16;
3865 	case VK_INDEX_TYPE_UINT32:
3866 		return V_028A7C_VGT_INDEX_32;
3867 	default:
3868 		unreachable("invalid index type");
3869 	}
3870 }
3871 
3872 static uint32_t
radv_get_vgt_index_size(uint32_t type)3873 radv_get_vgt_index_size(uint32_t type)
3874 {
3875 	switch (type) {
3876 	case V_028A7C_VGT_INDEX_8:
3877 		return 1;
3878 	case V_028A7C_VGT_INDEX_16:
3879 		return 2;
3880 	case V_028A7C_VGT_INDEX_32:
3881 		return 4;
3882 	default:
3883 		unreachable("invalid index type");
3884 	}
3885 }
3886 
/*
 * Bind an index buffer: record the buffer, offset, index type, GPU virtual
 * address and maximum index count in the command buffer state, mark the
 * index buffer state dirty and add the buffer's BO to the command stream's
 * buffer list.
 *
 * Nothing is done when the same buffer, offset and index type are already
 * bound.
 */
void radv_CmdBindIndexBuffer(
	VkCommandBuffer                             commandBuffer,
	VkBuffer buffer,
	VkDeviceSize offset,
	VkIndexType indexType)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
	const uint32_t vgt_index_type = vk_to_index_type(indexType);

	/* state.index_type holds the translated V_028A7C_VGT_INDEX_* value,
	 * so it has to be compared against the translated type. Comparing it
	 * with the raw VkIndexType does not reliably detect redundant binds
	 * because the two encodings are not the same enum (this notably
	 * matters for VK_INDEX_TYPE_UINT8_EXT).
	 */
	if (cmd_buffer->state.index_buffer == index_buffer &&
	    cmd_buffer->state.index_offset == offset &&
	    cmd_buffer->state.index_type == vgt_index_type) {
		/* No state changes. */
		return;
	}

	cmd_buffer->state.index_buffer = index_buffer;
	cmd_buffer->state.index_offset = offset;
	cmd_buffer->state.index_type = vgt_index_type;
	cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
	cmd_buffer->state.index_va += index_buffer->offset + offset;

	/* Number of indices that fit between the offset and the end of the
	 * buffer.
	 */
	const uint32_t index_size = radv_get_vgt_index_size(vgt_index_type);
	cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
}
3914 
3915 
3916 static void
radv_bind_descriptor_set(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point,struct radv_descriptor_set * set,unsigned idx)3917 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
3918 			 VkPipelineBindPoint bind_point,
3919 			 struct radv_descriptor_set *set, unsigned idx)
3920 {
3921 	struct radeon_winsys *ws = cmd_buffer->device->ws;
3922 
3923 	radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
3924 
3925 	assert(set);
3926 
3927 	if (!cmd_buffer->device->use_global_bo_list) {
3928 		for (unsigned j = 0; j < set->buffer_count; ++j)
3929 			if (set->descriptors[j])
3930 				radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
3931 	}
3932 
3933 	if(set->bo)
3934 		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
3935 }
3936 
/*
 * Bind descriptorSetCount descriptor sets starting at slot firstSet and
 * rebuild the buffer descriptors of their dynamic ranges.
 *
 * pDynamicOffsets is consumed in order: one entry for every dynamic range
 * of every set, in the order the sets are passed in.
 */
void radv_CmdBindDescriptorSets(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipelineLayout                            _layout,
	uint32_t                                    firstSet,
	uint32_t                                    descriptorSetCount,
	const VkDescriptorSet*                      pDescriptorSets,
	uint32_t                                    dynamicOffsetCount,
	const uint32_t*                             pDynamicOffsets)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
	const bool no_dynamic_bounds =
		cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
	/* Position of the next unconsumed entry in pDynamicOffsets. */
	unsigned next_offset = 0;

	for (unsigned i = 0; i < descriptorSetCount; ++i) {
		const unsigned set_idx = firstSet + i;
		RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);

		/* A set that is already bound and valid only needs its
		 * (potentially changed) dynamic offsets refreshed.
		 */
		const bool already_bound =
			descriptors_state->sets[set_idx] == set &&
			(descriptors_state->valid & (1u << set_idx));

		if (!already_bound)
			radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);

		for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++next_offset) {
			const unsigned slot = layout->set[set_idx].dynamic_offset_start + j;
			/* Each dynamic buffer descriptor is four dwords. */
			uint32_t *dst = &descriptors_state->dynamic_buffers[slot * 4];
			const struct radv_descriptor_range *range = &set->dynamic_descriptors[j];

			assert(next_offset < dynamicOffsetCount);

			cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages;

			if (!range->va) {
				/* No buffer behind this range: write a zeroed descriptor. */
				memset(dst, 0, 4 * 4);
				continue;
			}

			const uint64_t va = range->va + pDynamicOffsets[next_offset];
			uint32_t word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

			/* The format fields of the last dword differ from GFX10 on. */
			if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
				word3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
					 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
					 S_008F0C_RESOURCE_LEVEL(1);
			} else {
				word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
			}

			dst[0] = va;
			dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			/* The debug flag replaces the range size with the maximum value. */
			dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
			dst[3] = word3;
		}
	}
}
3999 
/*
 * Prepare a push descriptor set for the given layout.
 *
 * Records the layout and its size in the set and makes sure the host
 * storage behind set->mapped_ptr can hold it. When the current capacity is
 * too small the storage is freed and reallocated (old contents are not
 * kept): the new size is at least 1024 bytes and at least twice the old
 * capacity, capped at 96 * MAX_PUSH_DESCRIPTORS.
 *
 * Returns false on allocation failure, in which case the capacity is reset
 * to 0 and VK_ERROR_OUT_OF_HOST_MEMORY is stored in the command buffer's
 * record_result.
 */
static bool radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
                                          struct radv_descriptor_set *set,
                                          struct radv_descriptor_set_layout *layout,
					  VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);

	set->layout = layout;
	set->size = layout->size;

	/* The existing storage is already big enough. */
	if (descriptors_state->push_set.capacity >= set->size)
		return true;

	size_t new_size = MAX2(set->size, 1024);
	new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
	new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);

	free(set->mapped_ptr);
	set->mapped_ptr = malloc(new_size);

	if (set->mapped_ptr) {
		descriptors_state->push_set.capacity = new_size;
		return true;
	}

	descriptors_state->push_set.capacity = 0;
	cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
	return false;
}
4029 
/*
 * Push descriptors for meta operations.
 *
 * Uses the command buffer's dedicated meta push descriptor set: storage for
 * it is taken from the command buffer upload area, the descriptor writes
 * are applied, and the set is made current at the given slot. Only set 0
 * with a push-descriptor layout is accepted. Nothing is bound when the
 * upload allocation fails.
 */
void radv_meta_push_descriptor_set(
	struct radv_cmd_buffer*              cmd_buffer,
	VkPipelineBindPoint                  pipelineBindPoint,
	VkPipelineLayout                     _layout,
	uint32_t                             set,
	uint32_t                             descriptorWriteCount,
	const VkWriteDescriptorSet*          pDescriptorWrites)
{
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_set *meta_set = &cmd_buffer->meta_push_descriptors;
	unsigned upload_offset;

	assert(set == 0);
	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	struct radv_descriptor_set_layout *set_layout = layout->set[set].layout;

	meta_set->size = set_layout->size;
	meta_set->layout = set_layout;

	if (!radv_cmd_buffer_upload_alloc(cmd_buffer, meta_set->size, 32,
	                                  &upload_offset,
	                                  (void**) &meta_set->mapped_ptr)) {
		return;
	}

	/* GPU address of the freshly allocated slice of the upload BO. */
	meta_set->va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + upload_offset;

	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
	                            radv_descriptor_set_to_handle(meta_set),
	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);

	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, meta_set, set);
}
4062 
/*
 * Update and bind the push descriptor set of the given bind point.
 *
 * The per-bind-point push set is (re)initialised for the layout of the
 * requested set index, the descriptor writes are applied to it, and it is
 * made current at that index with the push_dirty flag raised. The command
 * does nothing further when preparing the push set storage fails.
 */
void radv_CmdPushDescriptorSetKHR(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipelineLayout                            _layout,
	uint32_t                                    set,
	uint32_t                                    descriptorWriteCount,
	const VkWriteDescriptorSet*                 pDescriptorWrites)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;

	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	if (!radv_init_push_descriptor_set(cmd_buffer, push_set,
					   layout->set[set].layout,
					   pipelineBindPoint))
		return;

	/* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
	 * because it is invalid, according to Vulkan spec.
	 *
	 * The index has the same unsigned type as descriptorWriteCount so the
	 * loop condition does not compare signed with unsigned values.
	 */
	for (uint32_t i = 0; i < descriptorWriteCount; i++) {
		ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
		assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
	}

	radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
	                            radv_descriptor_set_to_handle(push_set),
	                            descriptorWriteCount, pDescriptorWrites, 0, NULL);

	radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
	descriptors_state->push_dirty = true;
}
4099 
/*
 * Template flavour of the push descriptor update.
 *
 * The bind point comes from the update template. The push set of that bind
 * point is prepared for the layout of the requested set index, filled from
 * pData through the template, bound at that index and flagged push_dirty.
 * The command does nothing further when preparing the push set storage
 * fails.
 */
void radv_CmdPushDescriptorSetWithTemplateKHR(
	VkCommandBuffer                             commandBuffer,
	VkDescriptorUpdateTemplate                  descriptorUpdateTemplate,
	VkPipelineLayout                            _layout,
	uint32_t                                    set,
	const void*                                 pData)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
	RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
	const VkPipelineBindPoint bind_point = templ->bind_point;
	struct radv_descriptor_state *descriptors_state =
		radv_get_descriptors_state(cmd_buffer, bind_point);
	struct radv_descriptor_set *push_set = &descriptors_state->push_set.set;

	assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

	const bool storage_ready =
		radv_init_push_descriptor_set(cmd_buffer, push_set,
					      layout->set[set].layout,
					      bind_point);
	if (!storage_ready)
		return;

	radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
						 descriptorUpdateTemplate, pData);

	radv_set_descriptor_set(cmd_buffer, bind_point, push_set, set);
	descriptors_state->push_dirty = true;
}
4127 
/*
 * Copy size bytes from pValues into the command buffer's push constant
 * storage at the given offset and record which shader stages are affected.
 */
void radv_CmdPushConstants(VkCommandBuffer commandBuffer,
			   VkPipelineLayout layout,
			   VkShaderStageFlags stageFlags,
			   uint32_t offset,
			   uint32_t size,
			   const void* pValues)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	void *dst = cmd_buffer->push_constants + offset;

	cmd_buffer->push_constant_stages |= stageFlags;
	memcpy(dst, pValues, size);
}
4139 
radv_EndCommandBuffer(VkCommandBuffer commandBuffer)4140 VkResult radv_EndCommandBuffer(
4141 	VkCommandBuffer                             commandBuffer)
4142 {
4143 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4144 
4145 	radv_emit_mip_change_flush_default(cmd_buffer);
4146 
4147 	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
4148 		if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
4149 			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
4150 
4151 		/* Make sure to sync all pending active queries at the end of
4152 		 * command buffer.
4153 		 */
4154 		cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
4155 
4156 		/* Since NGG streamout uses GDS, we need to make GDS idle when
4157 		 * we leave the IB, otherwise another process might overwrite
4158 		 * it while our shaders are busy.
4159 		 */
4160 		if (cmd_buffer->gds_needed)
4161 			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4162 
4163 		si_emit_cache_flush(cmd_buffer);
4164 	}
4165 
4166 	/* Make sure CP DMA is idle at the end of IBs because the kernel
4167 	 * doesn't wait for it.
4168 	 */
4169 	si_cp_dma_wait_for_idle(cmd_buffer);
4170 
4171 	radv_describe_end_cmd_buffer(cmd_buffer);
4172 
4173 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
4174 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
4175 
4176 	VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
4177 	if (result != VK_SUCCESS)
4178 		return vk_error(cmd_buffer->device->instance, result);
4179 
4180 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
4181 
4182 	return cmd_buffer->record_result;
4183 }
4184 
4185 static void
radv_emit_compute_pipeline(struct radv_cmd_buffer * cmd_buffer)4186 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
4187 {
4188 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
4189 
4190 	if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
4191 		return;
4192 
4193 	assert(!pipeline->ctx_cs.cdw);
4194 
4195 	cmd_buffer->state.emitted_compute_pipeline = pipeline;
4196 
4197 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
4198 	radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
4199 
4200 	cmd_buffer->compute_scratch_size_per_wave_needed = MAX2(cmd_buffer->compute_scratch_size_per_wave_needed,
4201 	                                                        pipeline->scratch_bytes_per_wave);
4202 	cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted,
4203 	                                                pipeline->max_waves);
4204 
4205 	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
4206 			   pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
4207 
4208 	if (unlikely(cmd_buffer->device->trace_bo))
4209 		radv_save_pipeline(cmd_buffer, pipeline);
4210 }
4211 
/* Flag every currently valid descriptor set of the bind point as dirty. */
static void radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer,
					    VkPipelineBindPoint bind_point)
{
	struct radv_descriptor_state *state =
		radv_get_descriptors_state(cmd_buffer, bind_point);

	state->dirty = state->dirty | state->valid;
}
4220 
/*
 * Bind a compute or graphics pipeline.
 *
 * Rebinding the pipeline that is already bound is a no-op. Otherwise the
 * valid descriptor sets of the bind point are marked dirty and the pipeline
 * is recorded in the command buffer state. For graphics pipelines the
 * dependent state (dirty flags, push constant stages, shader prefetch,
 * dynamic and streamout state, ring size requirements) is updated as well.
 */
void radv_CmdBindPipeline(
	VkCommandBuffer                             commandBuffer,
	VkPipelineBindPoint                         pipelineBindPoint,
	VkPipeline                                  _pipeline)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

	if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
		if (cmd_buffer->state.compute_pipeline == pipeline)
			return;

		radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

		cmd_buffer->state.compute_pipeline = pipeline;
		cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
		return;
	}

	if (pipelineBindPoint != VK_PIPELINE_BIND_POINT_GRAPHICS) {
		assert(!"invalid bind point");
		return;
	}

	if (cmd_buffer->state.pipeline == pipeline)
		return;

	radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

	cmd_buffer->state.pipeline = pipeline;
	if (!pipeline)
		return;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
	cmd_buffer->push_constant_stages |= pipeline->active_stages;

	/* the new vertex shader might not have the same user regs */
	cmd_buffer->state.last_first_instance = -1;
	cmd_buffer->state.last_vertex_offset = -1;

	/* Prefetch all pipeline shaders at first draw time. */
	cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

	const bool flush_on_ngg_exit =
		cmd_buffer->device->physical_device->rad_info.chip_class == GFX10 ||
		cmd_buffer->device->physical_device->rad_info.family == CHIP_SIENNA_CICHLID;

	if (flush_on_ngg_exit &&
	    cmd_buffer->state.emitted_pipeline &&
	    radv_pipeline_has_ngg(cmd_buffer->state.emitted_pipeline) &&
	    !radv_pipeline_has_ngg(pipeline)) {
		/* Transitioning from NGG to legacy GS requires
		 * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
		 * is also emitted at the beginning of IBs when legacy
		 * GS ring pointers are set.
		 */
		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
	}

	radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
	radv_bind_streamout_state(cmd_buffer, pipeline);

	/* Keep the largest ring sizes requested by any bound pipeline. */
	if (cmd_buffer->esgs_ring_size_needed < pipeline->graphics.esgs_ring_size)
		cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
	if (cmd_buffer->gsvs_ring_size_needed < pipeline->graphics.gsvs_ring_size)
		cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;

	if (radv_pipeline_has_tess(pipeline))
		cmd_buffer->tess_rings_needed = true;
}
4286 
/*
 * Update viewportCount dynamic viewports starting at firstViewport.
 *
 * The stored viewport count only ever grows. Nothing is flagged dirty when
 * the range lies within the current count and already holds identical data.
 */
void radv_CmdSetViewport(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstViewport,
	uint32_t                                    viewportCount,
	const VkViewport*                           pViewports)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	ASSERTED const uint32_t total_count = firstViewport + viewportCount;
	const size_t copy_bytes = viewportCount * sizeof(*pViewports);

	assert(firstViewport < MAX_VIEWPORTS);
	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);

	VkViewport *dst = &state->dynamic.viewport.viewports[firstViewport];

	const bool unchanged = total_count <= state->dynamic.viewport.count &&
			       memcmp(dst, pViewports, copy_bytes) == 0;
	if (unchanged)
		return;

	if (total_count > state->dynamic.viewport.count)
		state->dynamic.viewport.count = total_count;

	memcpy(dst, pViewports, copy_bytes);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
}
4314 
/*
 * Update scissorCount dynamic scissors starting at firstScissor.
 *
 * The stored scissor count only ever grows. Nothing is flagged dirty when
 * the range lies within the current count and already holds identical data.
 */
void radv_CmdSetScissor(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstScissor,
	uint32_t                                    scissorCount,
	const VkRect2D*                             pScissors)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	ASSERTED const uint32_t total_count = firstScissor + scissorCount;
	const size_t copy_bytes = scissorCount * sizeof(*pScissors);

	assert(firstScissor < MAX_SCISSORS);
	assert(total_count >= 1 && total_count <= MAX_SCISSORS);

	VkRect2D *dst = &state->dynamic.scissor.scissors[firstScissor];

	const bool unchanged = total_count <= state->dynamic.scissor.count &&
			       memcmp(dst, pScissors, copy_bytes) == 0;
	if (unchanged)
		return;

	if (total_count > state->dynamic.scissor.count)
		state->dynamic.scissor.count = total_count;

	memcpy(dst, pScissors, copy_bytes);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
}
4342 
/* Set the dynamic line width; the state is only flagged dirty on change. */
void radv_CmdSetLineWidth(
	VkCommandBuffer                             commandBuffer,
	float                                       lineWidth)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.line_width != lineWidth) {
		state->dynamic.line_width = lineWidth;
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
	}
}
4355 
/*
 * Set the dynamic depth bias (constant factor, clamp and slope factor).
 * The state is only flagged dirty when at least one value changes.
 */
void radv_CmdSetDepthBias(
	VkCommandBuffer                             commandBuffer,
	float                                       depthBiasConstantFactor,
	float                                       depthBiasClamp,
	float                                       depthBiasSlopeFactor)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	const bool unchanged =
		state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
		state->dynamic.depth_bias.clamp == depthBiasClamp &&
		state->dynamic.depth_bias.slope == depthBiasSlopeFactor;

	if (unchanged)
		return;

	state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
	state->dynamic.depth_bias.clamp = depthBiasClamp;
	state->dynamic.depth_bias.bias = depthBiasConstantFactor;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
}
4377 
/*
 * Set the four dynamic blend constants. The values are compared bytewise
 * and the state is only flagged dirty when they differ from the stored ones.
 */
void radv_CmdSetBlendConstants(
	VkCommandBuffer                             commandBuffer,
	const float                                 blendConstants[4])
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	const size_t constants_size = sizeof(float) * 4;

	if (memcmp(state->dynamic.blend_constants, blendConstants, constants_size) != 0) {
		memcpy(state->dynamic.blend_constants, blendConstants, constants_size);
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
	}
}
4392 
/*
 * Set the dynamic depth bounds. The state is only flagged dirty when the
 * minimum or the maximum changes.
 */
void radv_CmdSetDepthBounds(
	VkCommandBuffer                             commandBuffer,
	float                                       minDepthBounds,
	float                                       maxDepthBounds)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	const bool unchanged = state->dynamic.depth_bounds.min == minDepthBounds &&
			       state->dynamic.depth_bounds.max == maxDepthBounds;
	if (unchanged)
		return;

	state->dynamic.depth_bounds.max = maxDepthBounds;
	state->dynamic.depth_bounds.min = minDepthBounds;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
}
4411 
/*
 * Set the dynamic stencil compare mask for the faces selected by faceMask.
 * The state is only flagged dirty when a selected face gets a new value.
 */
void radv_CmdSetStencilCompareMask(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    compareMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	const bool set_front = faceMask & VK_STENCIL_FACE_FRONT_BIT;
	const bool set_back = faceMask & VK_STENCIL_FACE_BACK_BIT;
	const bool front_changes = set_front &&
		state->dynamic.stencil_compare_mask.front != compareMask;
	const bool back_changes = set_back &&
		state->dynamic.stencil_compare_mask.back != compareMask;

	if (!front_changes && !back_changes)
		return;

	if (set_front)
		state->dynamic.stencil_compare_mask.front = compareMask;
	if (set_back)
		state->dynamic.stencil_compare_mask.back = compareMask;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
}
4434 
/*
 * Set the dynamic stencil write mask for the faces selected by faceMask.
 * The state is only flagged dirty when a selected face gets a new value.
 */
void radv_CmdSetStencilWriteMask(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    writeMask)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	const bool set_front = faceMask & VK_STENCIL_FACE_FRONT_BIT;
	const bool set_back = faceMask & VK_STENCIL_FACE_BACK_BIT;
	const bool front_changes = set_front &&
		state->dynamic.stencil_write_mask.front != writeMask;
	const bool back_changes = set_back &&
		state->dynamic.stencil_write_mask.back != writeMask;

	if (!front_changes && !back_changes)
		return;

	if (set_front)
		state->dynamic.stencil_write_mask.front = writeMask;
	if (set_back)
		state->dynamic.stencil_write_mask.back = writeMask;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
}
4457 
/*
 * Set the dynamic stencil reference for the faces selected by faceMask.
 * The state is only flagged dirty when a selected face gets a new value.
 */
void radv_CmdSetStencilReference(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	uint32_t                                    reference)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	const bool set_front = faceMask & VK_STENCIL_FACE_FRONT_BIT;
	const bool set_back = faceMask & VK_STENCIL_FACE_BACK_BIT;
	const bool front_changes = set_front &&
		state->dynamic.stencil_reference.front != reference;
	const bool back_changes = set_back &&
		state->dynamic.stencil_reference.back != reference;

	if (!front_changes && !back_changes)
		return;

	if (set_front)
		state->dynamic.stencil_reference.front = reference;
	if (set_back)
		state->dynamic.stencil_reference.back = reference;

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
}
4480 
/*
 * Update discardRectangleCount dynamic discard rectangles starting at
 * firstDiscardRectangle. The state is only flagged dirty when the stored
 * rectangles in that range differ bytewise from the new ones.
 */
void radv_CmdSetDiscardRectangleEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    firstDiscardRectangle,
	uint32_t                                    discardRectangleCount,
	const VkRect2D*                             pDiscardRectangles)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;

	assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
	assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);

	VkRect2D *dst = &state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle];

	if (memcmp(dst, pDiscardRectangles,
		   discardRectangleCount * sizeof(*pDiscardRectangles)) == 0) {
		return;
	}

	typed_memcpy(dst, pDiscardRectangles, discardRectangleCount);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}
4504 
/*
 * Store the dynamic sample locations (samples per pixel, grid size, count
 * and the locations themselves) and flag the state dirty unconditionally.
 */
void radv_CmdSetSampleLocationsEXT(
	VkCommandBuffer                             commandBuffer,
	const VkSampleLocationsInfoEXT*             pSampleLocationsInfo)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);

	state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
	state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
	state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;

	typed_memcpy(&state->dynamic.sample_location.locations[0],
		     pSampleLocationsInfo->pSampleLocations,
		     pSampleLocationsInfo->sampleLocationsCount);

	state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
}
4523 
/*
 * Store the dynamic line stipple factor and pattern and flag the state
 * dirty unconditionally.
 */
void radv_CmdSetLineStippleEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    lineStippleFactor,
	uint16_t                                    lineStipplePattern)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	cmd_buffer->state.dynamic.line_stipple.pattern = lineStipplePattern;
	cmd_buffer->state.dynamic.line_stipple.factor = lineStippleFactor;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
}
4537 
/* Set the dynamic cull mode; the state is only flagged dirty on change. */
void radv_CmdSetCullModeEXT(
	VkCommandBuffer                             commandBuffer,
	VkCullModeFlags                             cullMode)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.cull_mode != cullMode) {
		state->dynamic.cull_mode = cullMode;
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
	}
}
4552 
/* Set the dynamic front face; the state is only flagged dirty on change. */
void radv_CmdSetFrontFaceEXT(
	VkCommandBuffer                             commandBuffer,
	VkFrontFace                                 frontFace)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.front_face != frontFace) {
		state->dynamic.front_face = frontFace;
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
	}
}
4567 
/*
 * Set the dynamic primitive topology. The value is stored after conversion
 * with si_translate_prim(); the state is only flagged dirty when the
 * converted value changes.
 */
void radv_CmdSetPrimitiveTopologyEXT(
	VkCommandBuffer                             commandBuffer,
	VkPrimitiveTopology                         primitiveTopology)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;
	const unsigned translated_prim = si_translate_prim(primitiveTopology);

	if (state->dynamic.primitive_topology != translated_prim) {
		state->dynamic.primitive_topology = translated_prim;
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
	}
}
4583 
/*
 * Set the first viewportCount dynamic viewports by forwarding to
 * radv_CmdSetViewport() with a starting index of 0.
 */
void radv_CmdSetViewportWithCountEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    viewportCount,
	const VkViewport*                           pViewports)
{
	const uint32_t first_viewport = 0;

	radv_CmdSetViewport(commandBuffer, first_viewport, viewportCount, pViewports);
}
4591 
/*
 * Set the first scissorCount dynamic scissors by forwarding to
 * radv_CmdSetScissor() with a starting index of 0.
 */
void radv_CmdSetScissorWithCountEXT(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    scissorCount,
	const VkRect2D*                             pScissors)
{
	const uint32_t first_scissor = 0;

	radv_CmdSetScissor(commandBuffer, first_scissor, scissorCount, pScissors);
}
4599 
/* Set the dynamic depth test enable; only flagged dirty on change. */
void radv_CmdSetDepthTestEnableEXT(
	VkCommandBuffer                             commandBuffer,
	VkBool32                                    depthTestEnable)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *state = &cmd_buffer->state;

	if (state->dynamic.depth_test_enable != depthTestEnable) {
		state->dynamic.depth_test_enable = depthTestEnable;
		state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
	}
}
4615 
/*
 * Record the dynamic depth-write enable flag.
 *
 * The value is stored in the command buffer's dynamic state and the
 * corresponding dirty bit is set, unless the stored value already matches.
 */
void radv_CmdSetDepthWriteEnableEXT(
	VkCommandBuffer                             commandBuffer,
	VkBool32                                    depthWriteEnable)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *cmd_state = &cmd_buffer->state;

	if (cmd_state->dynamic.depth_write_enable != depthWriteEnable) {
		cmd_state->dynamic.depth_write_enable = depthWriteEnable;
		cmd_state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
	}
}
4630 
/*
 * Record the dynamic depth compare operation.
 *
 * The value is stored in the command buffer's dynamic state and the
 * corresponding dirty bit is set, unless the stored value already matches.
 */
void radv_CmdSetDepthCompareOpEXT(
	VkCommandBuffer                             commandBuffer,
	VkCompareOp                                 depthCompareOp)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *cmd_state = &cmd_buffer->state;

	if (cmd_state->dynamic.depth_compare_op != depthCompareOp) {
		cmd_state->dynamic.depth_compare_op = depthCompareOp;
		cmd_state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
	}
}
4645 
/*
 * Record the dynamic depth-bounds-test enable flag.
 *
 * The value is stored in the command buffer's dynamic state and the
 * corresponding dirty bit is set, unless the stored value already matches.
 */
void radv_CmdSetDepthBoundsTestEnableEXT(
	VkCommandBuffer                             commandBuffer,
	VkBool32                                    depthBoundsTestEnable)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *cmd_state = &cmd_buffer->state;

	if (cmd_state->dynamic.depth_bounds_test_enable != depthBoundsTestEnable) {
		cmd_state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
		cmd_state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
	}
}
4660 
/*
 * Record the dynamic stencil-test enable flag.
 *
 * The value is stored in the command buffer's dynamic state and the
 * corresponding dirty bit is set, unless the stored value already matches.
 */
void radv_CmdSetStencilTestEnableEXT(
	VkCommandBuffer                             commandBuffer,
	VkBool32                                    stencilTestEnable)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *cmd_state = &cmd_buffer->state;

	if (cmd_state->dynamic.stencil_test_enable != stencilTestEnable) {
		cmd_state->dynamic.stencil_test_enable = stencilTestEnable;
		cmd_state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
	}
}
4675 
/*
 * Record dynamic stencil operations for the faces selected by faceMask.
 *
 * The four operations are written to the front and/or back stencil state
 * depending on which face bits are set. If no selected face would end up
 * with different values, nothing is written and the state is not dirtied.
 */
void radv_CmdSetStencilOpEXT(
	VkCommandBuffer                             commandBuffer,
	VkStencilFaceFlags                          faceMask,
	VkStencilOp                                 failOp,
	VkStencilOp                                 passOp,
	VkStencilOp                                 depthFailOp,
	VkCompareOp                                 compareOp)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_cmd_state *cmd_state = &cmd_buffer->state;
	const bool wants_front = (faceMask & VK_STENCIL_FACE_FRONT_BIT) != 0;
	const bool wants_back = (faceMask & VK_STENCIL_FACE_BACK_BIT) != 0;

	const bool front_changes = wants_front &&
		!(cmd_state->dynamic.stencil_op.front.fail_op == failOp &&
		  cmd_state->dynamic.stencil_op.front.pass_op == passOp &&
		  cmd_state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
		  cmd_state->dynamic.stencil_op.front.compare_op == compareOp);
	const bool back_changes = wants_back &&
		!(cmd_state->dynamic.stencil_op.back.fail_op == failOp &&
		  cmd_state->dynamic.stencil_op.back.pass_op == passOp &&
		  cmd_state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
		  cmd_state->dynamic.stencil_op.back.compare_op == compareOp);

	/* Every selected face already holds these values: nothing to do. */
	if (!front_changes && !back_changes)
		return;

	if (wants_front) {
		cmd_state->dynamic.stencil_op.front.fail_op = failOp;
		cmd_state->dynamic.stencil_op.front.pass_op = passOp;
		cmd_state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
		cmd_state->dynamic.stencil_op.front.compare_op = compareOp;
	}

	if (wants_back) {
		cmd_state->dynamic.stencil_op.back.fail_op = failOp;
		cmd_state->dynamic.stencil_op.back.pass_op = passOp;
		cmd_state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
		cmd_state->dynamic.stencil_op.back.compare_op = compareOp;
	}

	cmd_state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
}
4717 
/*
 * Execute secondary command buffers from a primary one.
 *
 * For each secondary, the primary first absorbs its scratch, ring, sample
 * position and GDS requirements. The secondary's command stream is then
 * executed through the winsys, and the primary's "last emitted" tracking
 * is refreshed from the secondary. Once every secondary has been handled,
 * the pipeline, index buffer, all dynamic state and both descriptor bind
 * points are marked dirty on the primary.
 */
void radv_CmdExecuteCommands(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    commandBufferCount,
	const VkCommandBuffer*                      pCmdBuffers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
	struct radv_cmd_state *dst = &primary->state;

	assert(commandBufferCount > 0);

	radv_emit_mip_change_flush_default(primary);

	/* Flushes pending on the primary must be emitted before any
	 * secondary is executed.
	 */
	si_emit_cache_flush(primary);

	for (uint32_t n = 0; n < commandBufferCount; ++n) {
		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[n]);
		const struct radv_cmd_state *src = &secondary->state;

		/* Resource requirements: keep the largest value seen. */
		primary->scratch_size_per_wave_needed =
			MAX2(primary->scratch_size_per_wave_needed,
			     secondary->scratch_size_per_wave_needed);
		primary->scratch_waves_wanted =
			MAX2(primary->scratch_waves_wanted,
			     secondary->scratch_waves_wanted);
		primary->compute_scratch_size_per_wave_needed =
			MAX2(primary->compute_scratch_size_per_wave_needed,
			     secondary->compute_scratch_size_per_wave_needed);
		primary->compute_scratch_waves_wanted =
			MAX2(primary->compute_scratch_waves_wanted,
			     secondary->compute_scratch_waves_wanted);
		primary->esgs_ring_size_needed =
			MAX2(primary->esgs_ring_size_needed,
			     secondary->esgs_ring_size_needed);
		primary->gsvs_ring_size_needed =
			MAX2(primary->gsvs_ring_size_needed,
			     secondary->gsvs_ring_size_needed);

		/* Boolean requirements are sticky once any secondary sets them. */
		if (secondary->tess_rings_needed)
			primary->tess_rings_needed = true;
		if (secondary->sample_positions_needed)
			primary->sample_positions_needed = true;
		if (secondary->gds_needed)
			primary->gds_needed = true;

		/* A secondary recorded without a framebuffer relies on the
		 * primary's framebuffer state; emit it now if it is still
		 * dirty, otherwise fast color/depth clears can't work.
		 */
		if ((dst->dirty & RADV_CMD_DIRTY_FRAMEBUFFER) &&
		    !src->framebuffer) {
			radv_emit_fb_mip_change_flush(primary);
			radv_emit_framebuffer_state(primary);
		}

		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);

		/* A compute-only secondary leaves the emitted graphics
		 * pipeline untouched, so only overwrite it when set.
		 */
		if (src->emitted_pipeline)
			dst->emitted_pipeline = src->emitted_pipeline;

		/* Likewise for a graphics-only secondary and the emitted
		 * compute pipeline.
		 */
		if (src->emitted_compute_pipeline)
			dst->emitted_compute_pipeline = src->emitted_compute_pipeline;

		/* Inherit draw-register tracking only where the secondary
		 * recorded a value, so draw packets are re-emitted only when
		 * needed.
		 */
		if (src->last_primitive_reset_en != -1)
			dst->last_primitive_reset_en = src->last_primitive_reset_en;

		if (src->last_primitive_reset_index)
			dst->last_primitive_reset_index = src->last_primitive_reset_index;

		if (src->last_ia_multi_vgt_param)
			dst->last_ia_multi_vgt_param = src->last_ia_multi_vgt_param;

		/* These are copied unconditionally. */
		dst->last_first_instance = src->last_first_instance;
		dst->last_num_instances = src->last_num_instances;
		dst->last_vertex_offset = src->last_vertex_offset;
		dst->last_sx_ps_downconvert = src->last_sx_ps_downconvert;
		dst->last_sx_blend_opt_epsilon = src->last_sx_blend_opt_epsilon;
		dst->last_sx_blend_opt_control = src->last_sx_blend_opt_control;

		if (src->last_index_type != -1)
			dst->last_index_type = src->last_index_type;
	}

	/* Executing secondaries invalidates part of the primary's state;
	 * mark it dirty so it gets re-emitted.
	 */
	dst->dirty |= RADV_CMD_DIRTY_PIPELINE |
		      RADV_CMD_DIRTY_INDEX_BUFFER |
		      RADV_CMD_DIRTY_DYNAMIC_ALL;
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
	radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}
4822 
radv_CreateCommandPool(VkDevice _device,const VkCommandPoolCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkCommandPool * pCmdPool)4823 VkResult radv_CreateCommandPool(
4824 	VkDevice                                    _device,
4825 	const VkCommandPoolCreateInfo*              pCreateInfo,
4826 	const VkAllocationCallbacks*                pAllocator,
4827 	VkCommandPool*                              pCmdPool)
4828 {
4829 	RADV_FROM_HANDLE(radv_device, device, _device);
4830 	struct radv_cmd_pool *pool;
4831 
4832 	pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8,
4833 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4834 	if (pool == NULL)
4835 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
4836 
4837 	vk_object_base_init(&device->vk, &pool->base,
4838 			    VK_OBJECT_TYPE_COMMAND_POOL);
4839 
4840 	if (pAllocator)
4841 		pool->alloc = *pAllocator;
4842 	else
4843 		pool->alloc = device->vk.alloc;
4844 
4845 	list_inithead(&pool->cmd_buffers);
4846 	list_inithead(&pool->free_cmd_buffers);
4847 
4848 	pool->queue_family_index = pCreateInfo->queueFamilyIndex;
4849 
4850 	*pCmdPool = radv_cmd_pool_to_handle(pool);
4851 
4852 	return VK_SUCCESS;
4853 
4854 }
4855 
/*
 * Destroy a command pool.
 *
 * Every command buffer on the pool's active and free lists is destroyed
 * before the pool object itself is finished and freed. A null pool handle
 * is a no-op.
 */
void radv_DestroyCommandPool(
	VkDevice                                    _device,
	VkCommandPool                               commandPool,
	const VkAllocationCallbacks*                pAllocator)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

	if (pool == NULL)
		return;

	/* The _safe iterator is used because entries are destroyed while
	 * the list is being walked.
	 */
	list_for_each_entry_safe(struct radv_cmd_buffer, active,
				 &pool->cmd_buffers, pool_link)
		radv_destroy_cmd_buffer(active);

	list_for_each_entry_safe(struct radv_cmd_buffer, unused,
				 &pool->free_cmd_buffers, pool_link)
		radv_destroy_cmd_buffer(unused);

	vk_object_base_finish(&pool->base);
	vk_free2(&device->vk.alloc, pAllocator, pool);
}
4880 
radv_ResetCommandPool(VkDevice device,VkCommandPool commandPool,VkCommandPoolResetFlags flags)4881 VkResult radv_ResetCommandPool(
4882 	VkDevice                                    device,
4883 	VkCommandPool                               commandPool,
4884 	VkCommandPoolResetFlags                     flags)
4885 {
4886 	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4887 	VkResult result;
4888 
4889 	list_for_each_entry(struct radv_cmd_buffer, cmd_buffer,
4890 			    &pool->cmd_buffers, pool_link) {
4891 		result = radv_reset_cmd_buffer(cmd_buffer);
4892 		if (result != VK_SUCCESS)
4893 			return result;
4894 	}
4895 
4896 	return VK_SUCCESS;
4897 }
4898 
/*
 * Trim a command pool by destroying every command buffer on its free list.
 * A null pool handle is a no-op; the flags argument is not consulted.
 */
void radv_TrimCommandPool(
	VkDevice                                    device,
	VkCommandPool                               commandPool,
	VkCommandPoolTrimFlags                      flags)
{
	RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

	if (pool == NULL)
		return;

	/* Entries are destroyed during iteration, hence the _safe variant. */
	list_for_each_entry_safe(struct radv_cmd_buffer, unused,
				 &pool->free_cmd_buffers, pool_link)
		radv_destroy_cmd_buffer(unused);
}
4914 
4915 static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer * cmd_buffer,uint32_t subpass_id)4916 radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer,
4917 			      uint32_t subpass_id)
4918 {
4919 	struct radv_cmd_state *state = &cmd_buffer->state;
4920 	struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
4921 
4922 	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
4923 							   cmd_buffer->cs, 4096);
4924 
4925 	radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
4926 
4927 	radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
4928 
4929 	radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
4930 
4931 	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4932 		const uint32_t a = subpass->attachments[i].attachment;
4933 		if (a == VK_ATTACHMENT_UNUSED)
4934 			continue;
4935 
4936 		radv_handle_subpass_image_transition(cmd_buffer,
4937 						     subpass->attachments[i],
4938 						     true);
4939 	}
4940 
4941 	radv_describe_barrier_end(cmd_buffer);
4942 
4943 	radv_cmd_buffer_clear_subpass(cmd_buffer);
4944 
4945 	assert(cmd_buffer->cs->cdw <= cdw_max);
4946 }
4947 
4948 static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer * cmd_buffer)4949 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
4950 {
4951 	struct radv_cmd_state *state = &cmd_buffer->state;
4952 	const struct radv_subpass *subpass = state->subpass;
4953 	uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4954 
4955 	radv_cmd_buffer_resolve_subpass(cmd_buffer);
4956 
4957 	radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
4958 
4959 	for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
4960 		const uint32_t a = subpass->attachments[i].attachment;
4961 		if (a == VK_ATTACHMENT_UNUSED)
4962 			continue;
4963 
4964 		if (state->pass->attachments[a].last_subpass_idx != subpass_id)
4965 			continue;
4966 
4967 		VkImageLayout layout = state->pass->attachments[a].final_layout;
4968 		VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
4969 		struct radv_subpass_attachment att = { a, layout, stencil_layout };
4970 		radv_handle_subpass_image_transition(cmd_buffer, att, false);
4971 	}
4972 
4973 	radv_describe_barrier_end(cmd_buffer);
4974 }
4975 
4976 void
radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer * cmd_buffer,const VkRenderPassBeginInfo * pRenderPassBegin)4977 radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer,
4978 				  const VkRenderPassBeginInfo *pRenderPassBegin)
4979 {
4980 	RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
4981 	RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
4982 	VkResult result;
4983 
4984 	cmd_buffer->state.framebuffer = framebuffer;
4985 	cmd_buffer->state.pass = pass;
4986 	cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
4987 
4988 	result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin);
4989 	if (result != VK_SUCCESS)
4990 		return;
4991 
4992 	result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
4993 	if (result != VK_SUCCESS)
4994 		return;
4995 }
4996 
/*
 * Begin a render pass instance: set up the render pass state, then begin
 * subpass 0. The contents argument is not consulted.
 */
void radv_CmdBeginRenderPass(
	VkCommandBuffer                             commandBuffer,
	const VkRenderPassBeginInfo*                pRenderPassBegin,
	VkSubpassContents                           contents)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	const uint32_t first_subpass = 0;

	radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBegin);
	radv_cmd_buffer_begin_subpass(cmd_buffer, first_subpass);
}
5008 
/*
 * Begin a render pass instance by forwarding to radv_CmdBeginRenderPass()
 * with the contents taken from pSubpassBeginInfo.
 */
void radv_CmdBeginRenderPass2(
	VkCommandBuffer                             commandBuffer,
	const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
	const VkSubpassBeginInfo*                   pSubpassBeginInfo)
{
	const VkSubpassContents contents = pSubpassBeginInfo->contents;

	radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo, contents);
}
5017 
/*
 * Advance to the next subpass: end the current one, then begin the subpass
 * whose index is one higher. The contents argument is not consulted.
 */
void radv_CmdNextSubpass(
	VkCommandBuffer                             commandBuffer,
	VkSubpassContents                           contents)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	/* Read the index before ending the subpass. */
	const uint32_t finished_id = radv_get_subpass_id(cmd_buffer);

	radv_cmd_buffer_end_subpass(cmd_buffer);
	radv_cmd_buffer_begin_subpass(cmd_buffer, finished_id + 1);
}
5028 
/*
 * Advance to the next subpass by forwarding to radv_CmdNextSubpass() with
 * the contents taken from pSubpassBeginInfo. pSubpassEndInfo is not
 * consulted.
 */
void radv_CmdNextSubpass2(
	VkCommandBuffer                             commandBuffer,
	const VkSubpassBeginInfo*                   pSubpassBeginInfo,
	const VkSubpassEndInfo*                     pSubpassEndInfo)
{
	const VkSubpassContents contents = pSubpassBeginInfo->contents;

	radv_CmdNextSubpass(commandBuffer, contents);
}
5036 
radv_emit_view_index(struct radv_cmd_buffer * cmd_buffer,unsigned index)5037 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
5038 {
5039 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
5040 	for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
5041 		if (!radv_get_shader(pipeline, stage))
5042 			continue;
5043 
5044 		struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
5045 		if (loc->sgpr_idx == -1)
5046 			continue;
5047 		uint32_t base_reg = pipeline->user_data_0[stage];
5048 		radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5049 
5050 	}
5051 	if (radv_pipeline_has_gs_copy_shader(pipeline)) {
5052 		struct radv_userdata_info *loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
5053 		if (loc->sgpr_idx != -1) {
5054 			uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5055 			radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5056 		}
5057 	}
5058 }
5059 
5060 static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer * cmd_buffer,uint32_t vertex_count,bool use_opaque)5061 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
5062                          uint32_t vertex_count,
5063 			 bool use_opaque)
5064 {
5065 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
5066 	radeon_emit(cmd_buffer->cs, vertex_count);
5067 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
5068 	                            S_0287F0_USE_OPAQUE(use_opaque));
5069 }
5070 
5071 static void
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer * cmd_buffer,uint64_t index_va,uint32_t index_count)5072 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
5073                                  uint64_t index_va,
5074                                  uint32_t index_count)
5075 {
5076 	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
5077 	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
5078 	radeon_emit(cmd_buffer->cs, index_va);
5079 	radeon_emit(cmd_buffer->cs, index_va >> 32);
5080 	radeon_emit(cmd_buffer->cs, index_count);
5081 	radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
5082 }
5083 
5084 static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer * cmd_buffer,bool indexed,uint32_t draw_count,uint64_t count_va,uint32_t stride)5085 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
5086                                   bool indexed,
5087                                   uint32_t draw_count,
5088                                   uint64_t count_va,
5089                                   uint32_t stride)
5090 {
5091 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5092 	unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
5093 	                              : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
5094 	bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
5095 	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
5096 	bool predicating = cmd_buffer->state.predicating;
5097 	assert(base_reg);
5098 
5099 	/* just reset draw state for vertex data */
5100 	cmd_buffer->state.last_first_instance = -1;
5101 	cmd_buffer->state.last_num_instances = -1;
5102 	cmd_buffer->state.last_vertex_offset = -1;
5103 
5104 	if (draw_count == 1 && !count_va && !draw_id_enable) {
5105 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
5106 				     PKT3_DRAW_INDIRECT, 3, predicating));
5107 		radeon_emit(cs, 0);
5108 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
5109 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
5110 		radeon_emit(cs, di_src_sel);
5111 	} else {
5112 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
5113 				     PKT3_DRAW_INDIRECT_MULTI,
5114 				     8, predicating));
5115 		radeon_emit(cs, 0);
5116 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
5117 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
5118 		radeon_emit(cs, (((base_reg + 8) - SI_SH_REG_OFFSET) >> 2) |
5119 			    S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
5120 			    S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
5121 		radeon_emit(cs, draw_count); /* count */
5122 		radeon_emit(cs, count_va); /* count_addr */
5123 		radeon_emit(cs, count_va >> 32);
5124 		radeon_emit(cs, stride); /* stride */
5125 		radeon_emit(cs, di_src_sel);
5126 	}
5127 }
5128 
5129 static void
radv_emit_draw_packets(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info)5130 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
5131 		       const struct radv_draw_info *info)
5132 {
5133 	struct radv_cmd_state *state = &cmd_buffer->state;
5134 	struct radeon_winsys *ws = cmd_buffer->device->ws;
5135 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5136 
5137 	radv_describe_draw(cmd_buffer);
5138 
5139 	if (info->indirect) {
5140 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
5141 		uint64_t count_va = 0;
5142 
5143 		va += info->indirect->offset + info->indirect_offset;
5144 
5145 		radv_cs_add_buffer(ws, cs, info->indirect->bo);
5146 
5147 		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
5148 		radeon_emit(cs, 1);
5149 		radeon_emit(cs, va);
5150 		radeon_emit(cs, va >> 32);
5151 
5152 		if (info->count_buffer) {
5153 			count_va = radv_buffer_get_va(info->count_buffer->bo);
5154 			count_va += info->count_buffer->offset +
5155 				    info->count_buffer_offset;
5156 
5157 			radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
5158 		}
5159 
5160 		if (!state->subpass->view_mask) {
5161 			radv_cs_emit_indirect_draw_packet(cmd_buffer,
5162 							  info->indexed,
5163 							  info->count,
5164 							  count_va,
5165 							  info->stride);
5166 		} else {
5167 			unsigned i;
5168 			for_each_bit(i, state->subpass->view_mask) {
5169 				radv_emit_view_index(cmd_buffer, i);
5170 
5171 				radv_cs_emit_indirect_draw_packet(cmd_buffer,
5172 								  info->indexed,
5173 								  info->count,
5174 								  count_va,
5175 								  info->stride);
5176 			}
5177 		}
5178 	} else {
5179 		assert(state->pipeline->graphics.vtx_base_sgpr);
5180 
5181 		if (info->vertex_offset != state->last_vertex_offset ||
5182 		    info->first_instance != state->last_first_instance) {
5183 			radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
5184 					      state->pipeline->graphics.vtx_emit_num);
5185 
5186 			radeon_emit(cs, info->vertex_offset);
5187 			radeon_emit(cs, info->first_instance);
5188 			if (state->pipeline->graphics.vtx_emit_num == 3)
5189 				radeon_emit(cs, 0);
5190 			state->last_first_instance = info->first_instance;
5191 			state->last_vertex_offset = info->vertex_offset;
5192 		}
5193 
5194 		if (state->last_num_instances != info->instance_count) {
5195 			radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
5196 			radeon_emit(cs, info->instance_count);
5197 			state->last_num_instances = info->instance_count;
5198 		}
5199 
5200 		if (info->indexed) {
5201 			int index_size = radv_get_vgt_index_size(state->index_type);
5202 			uint64_t index_va;
5203 
5204 			/* Skip draw calls with 0-sized index buffers. They
5205 			 * cause a hang on some chips, like Navi10-14.
5206 			 */
5207 			if (!cmd_buffer->state.max_index_count)
5208 				return;
5209 
5210 			index_va = state->index_va;
5211 			index_va += info->first_index * index_size;
5212 
5213 			if (!state->subpass->view_mask) {
5214 				radv_cs_emit_draw_indexed_packet(cmd_buffer,
5215 								 index_va,
5216 								 info->count);
5217 			} else {
5218 				unsigned i;
5219 				for_each_bit(i, state->subpass->view_mask) {
5220 					radv_emit_view_index(cmd_buffer, i);
5221 
5222 					radv_cs_emit_draw_indexed_packet(cmd_buffer,
5223 									 index_va,
5224 									 info->count);
5225 				}
5226 			}
5227 		} else {
5228 			if (!state->subpass->view_mask) {
5229 				radv_cs_emit_draw_packet(cmd_buffer,
5230 							 info->count,
5231 							 !!info->strmout_buffer);
5232 			} else {
5233 				unsigned i;
5234 				for_each_bit(i, state->subpass->view_mask) {
5235 					radv_emit_view_index(cmd_buffer, i);
5236 
5237 					radv_cs_emit_draw_packet(cmd_buffer,
5238 								 info->count,
5239 								 !!info->strmout_buffer);
5240 				}
5241 			}
5242 		}
5243 	}
5244 }
5245 
5246 /*
5247  * Vega and raven have a bug which triggers if there are multiple context
5248  * register contexts active at the same time with different scissor values.
5249  *
5250  * There are two possible workarounds:
5251  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
5252  *    there is only ever 1 active set of scissor values at the same time.
5253  *
5254  * 2) Whenever the hardware switches contexts we have to set the scissor
5255  *    registers again even if it is a noop. That way the new context gets
5256  *    the correct scissor values.
5257  *
5258  * This implements option 2. radv_need_late_scissor_emission needs to
5259  * return true on affected HW if radv_emit_all_graphics_states sets
5260  * any context registers.
5261  */
radv_need_late_scissor_emission(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info)5262 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
5263                                             const struct radv_draw_info *info)
5264 {
5265 	struct radv_cmd_state *state = &cmd_buffer->state;
5266 
5267 	if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
5268 		return false;
5269 
5270 	if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
5271 		return true;
5272 
5273 	uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
5274 
5275 	/* Index, vertex and streamout buffers don't change context regs, and
5276 	 * pipeline is already handled.
5277 	 */
5278 	used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
5279 			 RADV_CMD_DIRTY_VERTEX_BUFFER |
5280 			 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
5281 			 RADV_CMD_DIRTY_PIPELINE);
5282 
5283 	if (cmd_buffer->state.dirty & used_states)
5284 		return true;
5285 
5286 	uint32_t primitive_reset_index =
5287 		radv_get_primitive_reset_index(cmd_buffer);
5288 
5289 	if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
5290 	    primitive_reset_index != state->last_primitive_reset_index)
5291 		return true;
5292 
5293 	return false;
5294 }
5295 
/*
 * Emit all graphics state needed before a draw.
 *
 * In order: RB+ state (when the framebuffer is dirty or the bound pipeline
 * differs from the emitted one), the graphics pipeline, the framebuffer
 * state, the index buffer for indexed draws, the dynamic state and the draw
 * registers. The scissor is emitted again at the very end when
 * radv_need_late_scissor_emission() asked for it.
 *
 * The statement order matters: see the comment on the late scissor
 * decision below.
 */
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
			      const struct radv_draw_info *info)
{
	bool late_scissor_emission;

	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
		radv_emit_rbplus_state(cmd_buffer);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
		radv_emit_graphics_pipeline(cmd_buffer);

	/* This should be before the cmd_buffer->state.dirty is cleared
	 * (excluding RADV_CMD_DIRTY_PIPELINE) and after
	 * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
	late_scissor_emission =
		radv_need_late_scissor_emission(cmd_buffer, info);

	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
		radv_emit_framebuffer_state(cmd_buffer);

	if (info->indexed) {
		if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
			radv_emit_index_buffer(cmd_buffer, info->indirect);
	} else {
		/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
		 * so the state must be re-emitted before the next indexed
		 * draw.
		 */
		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
			/* Invalidate the cached index type and mark the index
			 * buffer dirty.
			 */
			cmd_buffer->state.last_index_type = -1;
			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
		}
	}

	radv_cmd_buffer_flush_dynamic_state(cmd_buffer);

	radv_emit_draw_registers(cmd_buffer, info);

	/* The decision was taken earlier; the emission itself comes last. */
	if (late_scissor_emission)
		radv_emit_scissor(cmd_buffer);
}
5339 
5340 static void
radv_draw(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info)5341 radv_draw(struct radv_cmd_buffer *cmd_buffer,
5342 	  const struct radv_draw_info *info)
5343 {
5344 	struct radeon_info *rad_info =
5345 		&cmd_buffer->device->physical_device->rad_info;
5346 	bool has_prefetch =
5347 		cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
5348 	bool pipeline_is_dirty =
5349 		(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
5350 		cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
5351 
5352 	ASSERTED unsigned cdw_max =
5353 		radeon_check_space(cmd_buffer->device->ws,
5354 				   cmd_buffer->cs, 4096);
5355 
5356 	if (likely(!info->indirect)) {
5357 		/* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
5358 		 * no workaround for indirect draws, but we can at least skip
5359 		 * direct draws.
5360 		 */
5361 		if (unlikely(!info->instance_count))
5362 			return;
5363 
5364 		/* Handle count == 0. */
5365 		if (unlikely(!info->count && !info->strmout_buffer))
5366 			return;
5367 	}
5368 
5369 	/* Need to apply this workaround early as it can set flush flags. */
5370 	if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
5371 		radv_emit_fb_mip_change_flush(cmd_buffer);
5372 
5373 	/* Use optimal packet order based on whether we need to sync the
5374 	 * pipeline.
5375 	 */
5376 	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5377 					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5378 					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
5379 					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
5380 		/* If we have to wait for idle, set all states first, so that
5381 		 * all SET packets are processed in parallel with previous draw
5382 		 * calls. Then upload descriptors, set shader pointers, and
5383 		 * draw, and prefetch at the end. This ensures that the time
5384 		 * the CUs are idle is very short. (there are only SET_SH
5385 		 * packets between the wait and the draw)
5386 		 */
5387 		radv_emit_all_graphics_states(cmd_buffer, info);
5388 		si_emit_cache_flush(cmd_buffer);
5389 		/* <-- CUs are idle here --> */
5390 
5391 		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
5392 
5393 		radv_emit_draw_packets(cmd_buffer, info);
5394 		/* <-- CUs are busy here --> */
5395 
5396 		/* Start prefetches after the draw has been started. Both will
5397 		 * run in parallel, but starting the draw first is more
5398 		 * important.
5399 		 */
5400 		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
5401 			radv_emit_prefetch_L2(cmd_buffer,
5402 					      cmd_buffer->state.pipeline, false);
5403 		}
5404 	} else {
5405 		/* If we don't wait for idle, start prefetches first, then set
5406 		 * states, and draw at the end.
5407 		 */
5408 		si_emit_cache_flush(cmd_buffer);
5409 
5410 		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
5411 			/* Only prefetch the vertex shader and VBO descriptors
5412 			 * in order to start the draw as soon as possible.
5413 			 */
5414 			radv_emit_prefetch_L2(cmd_buffer,
5415 					      cmd_buffer->state.pipeline, true);
5416 		}
5417 
5418 		radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
5419 
5420 		radv_emit_all_graphics_states(cmd_buffer, info);
5421 		radv_emit_draw_packets(cmd_buffer, info);
5422 
5423 		/* Prefetch the remaining shaders after the draw has been
5424 		 * started.
5425 		 */
5426 		if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
5427 			radv_emit_prefetch_L2(cmd_buffer,
5428 					      cmd_buffer->state.pipeline, false);
5429 		}
5430 	}
5431 
5432 	/* Workaround for a VGT hang when streamout is enabled.
5433 	 * It must be done after drawing.
5434 	 */
5435 	if (cmd_buffer->state.streamout.streamout_enabled &&
5436 	    (rad_info->family == CHIP_HAWAII ||
5437 	     rad_info->family == CHIP_TONGA ||
5438 	     rad_info->family == CHIP_FIJI)) {
5439 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
5440 	}
5441 
5442 	assert(cmd_buffer->cs->cdw <= cdw_max);
5443 	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
5444 }
5445 
/**
 * Non-indexed direct draw: packs the arguments into a radv_draw_info and
 * hands it to radv_draw(). All other fields of the info are left zeroed.
 */
void radv_CmdDraw(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    vertexCount,
	uint32_t                                    instanceCount,
	uint32_t                                    firstVertex,
	uint32_t                                    firstInstance)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	const struct radv_draw_info info = {
		.count = vertexCount,
		.instance_count = instanceCount,
		.first_instance = firstInstance,
		/* For non-indexed draws the first vertex travels in the
		 * vertex_offset field.
		 */
		.vertex_offset = firstVertex,
	};

	radv_draw(cmd_buffer, &info);
}
5463 
/**
 * Indexed direct draw: packs the arguments into a radv_draw_info flagged as
 * indexed and hands it to radv_draw().
 */
void radv_CmdDrawIndexed(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    indexCount,
	uint32_t                                    instanceCount,
	uint32_t                                    firstIndex,
	int32_t                                     vertexOffset,
	uint32_t                                    firstInstance)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	const struct radv_draw_info info = {
		.indexed = true,
		.count = indexCount,
		.instance_count = instanceCount,
		.first_index = firstIndex,
		.vertex_offset = vertexOffset,
		.first_instance = firstInstance,
	};

	radv_draw(cmd_buffer, &info);
}
5484 
/**
 * Non-indexed indirect draw: the draw parameters live in _buffer at the
 * given offset, drawCount records spaced stride bytes apart. The request is
 * forwarded to radv_draw().
 */
void radv_CmdDrawIndirect(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset,
	uint32_t                                    drawCount,
	uint32_t                                    stride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	const struct radv_draw_info info = {
		.count = drawCount,
		.indirect = buffer,
		.indirect_offset = offset,
		.stride = stride,
	};

	radv_draw(cmd_buffer, &info);
}
5503 
/**
 * Indexed indirect draw: same as radv_CmdDrawIndirect() but the draw info
 * is flagged as indexed before being forwarded to radv_draw().
 */
void radv_CmdDrawIndexedIndirect(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset,
	uint32_t                                    drawCount,
	uint32_t                                    stride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	const struct radv_draw_info info = {
		.indexed = true,
		.count = drawCount,
		.indirect = buffer,
		.indirect_offset = offset,
		.stride = stride,
	};

	radv_draw(cmd_buffer, &info);
}
5523 
/**
 * Non-indexed indirect draw whose draw count is read from _countBuffer at
 * countBufferOffset; maxDrawCount is stored as the info's count. The
 * request is forwarded to radv_draw().
 */
void radv_CmdDrawIndirectCount(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset,
	VkBuffer                                    _countBuffer,
	VkDeviceSize                                countBufferOffset,
	uint32_t                                    maxDrawCount,
	uint32_t                                    stride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
	const struct radv_draw_info info = {
		.count = maxDrawCount,
		.indirect = buffer,
		.indirect_offset = offset,
		.count_buffer = count_buffer,
		.count_buffer_offset = countBufferOffset,
		.stride = stride,
	};

	radv_draw(cmd_buffer, &info);
}
5547 
/**
 * Indexed indirect draw whose draw count is read from _countBuffer at
 * countBufferOffset; same as radv_CmdDrawIndirectCount() but the draw info
 * is flagged as indexed before being forwarded to radv_draw().
 */
void radv_CmdDrawIndexedIndirectCount(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset,
	VkBuffer                                    _countBuffer,
	VkDeviceSize                                countBufferOffset,
	uint32_t                                    maxDrawCount,
	uint32_t                                    stride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
	const struct radv_draw_info info = {
		.indexed = true,
		.count = maxDrawCount,
		.indirect = buffer,
		.indirect_offset = offset,
		.count_buffer = count_buffer,
		.count_buffer_offset = countBufferOffset,
		.stride = stride,
	};

	radv_draw(cmd_buffer, &info);
}
5572 
/**
 * Parameters of one compute dispatch, consumed by radv_dispatch() and
 * radv_emit_dispatch_packets().
 */
struct radv_dispatch_info {
	/**
	 * Grid layout, in block units, one entry per dimension (x, y, z).
	 */
	uint32_t blocks[3];

	/**
	 * Starting offset of the grid, one entry per dimension. Even when
	 * "unaligned" is set, the offset itself must still be aligned.
	 */
	uint32_t offsets[3];

	/**
	 * True for an unaligned compute dispatch.
	 */
	bool unaligned;

	/**
	 * Buffer holding the indirect compute parameters; NULL for a direct
	 * dispatch.
	 */
	struct radv_buffer *indirect;

	/**
	 * Offset of the indirect parameters, added on top of the buffer's
	 * own offset.
	 */
	uint64_t indirect_offset;
};
5595 
5596 static void
radv_emit_dispatch_packets(struct radv_cmd_buffer * cmd_buffer,const struct radv_dispatch_info * info)5597 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
5598 			   const struct radv_dispatch_info *info)
5599 {
5600 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
5601 	struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5602 	unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
5603 	struct radeon_winsys *ws = cmd_buffer->device->ws;
5604 	bool predicating = cmd_buffer->state.predicating;
5605 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
5606 	struct radv_userdata_info *loc;
5607 
5608 	radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1],
5609 	                       info->blocks[2]);
5610 
5611 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
5612 				    AC_UD_CS_GRID_SIZE);
5613 
5614 	ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
5615 
5616 	if (compute_shader->info.wave_size == 32) {
5617 		assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
5618 		dispatch_initiator |= S_00B800_CS_W32_EN(1);
5619 	}
5620 
5621 	if (info->indirect) {
5622 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
5623 
5624 		va += info->indirect->offset + info->indirect_offset;
5625 
5626 		radv_cs_add_buffer(ws, cs, info->indirect->bo);
5627 
5628 		if (loc->sgpr_idx != -1) {
5629 			for (unsigned i = 0; i < 3; ++i) {
5630 				radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
5631 				radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
5632 						COPY_DATA_DST_SEL(COPY_DATA_REG));
5633 				radeon_emit(cs, (va +  4 * i));
5634 				radeon_emit(cs, (va + 4 * i) >> 32);
5635 				radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0
5636 						 + loc->sgpr_idx * 4) >> 2) + i);
5637 				radeon_emit(cs, 0);
5638 			}
5639 		}
5640 
5641 		if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
5642 			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
5643 					PKT3_SHADER_TYPE_S(1));
5644 			radeon_emit(cs, va);
5645 			radeon_emit(cs, va >> 32);
5646 			radeon_emit(cs, dispatch_initiator);
5647 		} else {
5648 			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
5649 					PKT3_SHADER_TYPE_S(1));
5650 			radeon_emit(cs, 1);
5651 			radeon_emit(cs, va);
5652 			radeon_emit(cs, va >> 32);
5653 
5654 			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
5655 					PKT3_SHADER_TYPE_S(1));
5656 			radeon_emit(cs, 0);
5657 			radeon_emit(cs, dispatch_initiator);
5658 		}
5659 	} else {
5660 		unsigned blocks[3] = { info->blocks[0], info->blocks[1], info->blocks[2] };
5661 		unsigned offsets[3] = { info->offsets[0], info->offsets[1], info->offsets[2] };
5662 
5663 		if (info->unaligned) {
5664 			unsigned *cs_block_size = compute_shader->info.cs.block_size;
5665 			unsigned remainder[3];
5666 
5667 			/* If aligned, these should be an entire block size,
5668 			 * not 0.
5669 			 */
5670 			remainder[0] = blocks[0] + cs_block_size[0] -
5671 				       align_u32_npot(blocks[0], cs_block_size[0]);
5672 			remainder[1] = blocks[1] + cs_block_size[1] -
5673 				       align_u32_npot(blocks[1], cs_block_size[1]);
5674 			remainder[2] = blocks[2] + cs_block_size[2] -
5675 				       align_u32_npot(blocks[2], cs_block_size[2]);
5676 
5677 			blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
5678 			blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
5679 			blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
5680 
5681 			for(unsigned i = 0; i < 3; ++i) {
5682 				assert(offsets[i] % cs_block_size[i] == 0);
5683 				offsets[i] /= cs_block_size[i];
5684 			}
5685 
5686 			radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
5687 			radeon_emit(cs,
5688 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
5689 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
5690 			radeon_emit(cs,
5691 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
5692 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
5693 			radeon_emit(cs,
5694 				    S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
5695 				    S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
5696 
5697 			dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
5698 		}
5699 
5700 		if (loc->sgpr_idx != -1) {
5701 			assert(loc->num_sgprs == 3);
5702 
5703 			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
5704 						  loc->sgpr_idx * 4, 3);
5705 			radeon_emit(cs, blocks[0]);
5706 			radeon_emit(cs, blocks[1]);
5707 			radeon_emit(cs, blocks[2]);
5708 		}
5709 
5710 		if (offsets[0] || offsets[1] || offsets[2]) {
5711 			radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
5712 			radeon_emit(cs, offsets[0]);
5713 			radeon_emit(cs, offsets[1]);
5714 			radeon_emit(cs, offsets[2]);
5715 
5716 			/* The blocks in the packet are not counts but end values. */
5717 			for (unsigned i = 0; i < 3; ++i)
5718 				blocks[i] += offsets[i];
5719 		} else {
5720 			dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
5721 		}
5722 
5723 		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
5724 				PKT3_SHADER_TYPE_S(1));
5725 		radeon_emit(cs, blocks[0]);
5726 		radeon_emit(cs, blocks[1]);
5727 		radeon_emit(cs, blocks[2]);
5728 		radeon_emit(cs, dispatch_initiator);
5729 	}
5730 
5731 	assert(cmd_buffer->cs->cdw <= cdw_max);
5732 }
5733 
5734 static void
radv_upload_compute_shader_descriptors(struct radv_cmd_buffer * cmd_buffer)5735 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
5736 {
5737 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
5738 	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT);
5739 }
5740 
5741 static void
radv_dispatch(struct radv_cmd_buffer * cmd_buffer,const struct radv_dispatch_info * info)5742 radv_dispatch(struct radv_cmd_buffer *cmd_buffer,
5743 	      const struct radv_dispatch_info *info)
5744 {
5745 	struct radv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
5746 	bool has_prefetch =
5747 		cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
5748 	bool pipeline_is_dirty = pipeline &&
5749 				 pipeline != cmd_buffer->state.emitted_compute_pipeline;
5750 
5751 	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5752 					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5753 					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
5754 					    RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
5755 		/* If we have to wait for idle, set all states first, so that
5756 		 * all SET packets are processed in parallel with previous draw
5757 		 * calls. Then upload descriptors, set shader pointers, and
5758 		 * dispatch, and prefetch at the end. This ensures that the
5759 		 * time the CUs are idle is very short. (there are only SET_SH
5760 		 * packets between the wait and the draw)
5761 		 */
5762 		radv_emit_compute_pipeline(cmd_buffer);
5763 		si_emit_cache_flush(cmd_buffer);
5764 		/* <-- CUs are idle here --> */
5765 
5766 		radv_upload_compute_shader_descriptors(cmd_buffer);
5767 
5768 		radv_emit_dispatch_packets(cmd_buffer, info);
5769 		/* <-- CUs are busy here --> */
5770 
5771 		/* Start prefetches after the dispatch has been started. Both
5772 		 * will run in parallel, but starting the dispatch first is
5773 		 * more important.
5774 		 */
5775 		if (has_prefetch && pipeline_is_dirty) {
5776 			radv_emit_shader_prefetch(cmd_buffer,
5777 						  pipeline->shaders[MESA_SHADER_COMPUTE]);
5778 		}
5779 	} else {
5780 		/* If we don't wait for idle, start prefetches first, then set
5781 		 * states, and dispatch at the end.
5782 		 */
5783 		si_emit_cache_flush(cmd_buffer);
5784 
5785 		if (has_prefetch && pipeline_is_dirty) {
5786 			radv_emit_shader_prefetch(cmd_buffer,
5787 						  pipeline->shaders[MESA_SHADER_COMPUTE]);
5788 		}
5789 
5790 		radv_upload_compute_shader_descriptors(cmd_buffer);
5791 
5792 		radv_emit_compute_pipeline(cmd_buffer);
5793 		radv_emit_dispatch_packets(cmd_buffer, info);
5794 	}
5795 
5796 	radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
5797 }
5798 
/**
 * Direct dispatch of an x * y * z grid that starts at
 * (base_x, base_y, base_z). The arguments are packed into a
 * radv_dispatch_info and handed to radv_dispatch().
 */
void radv_CmdDispatchBase(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    base_x,
	uint32_t                                    base_y,
	uint32_t                                    base_z,
	uint32_t                                    x,
	uint32_t                                    y,
	uint32_t                                    z)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	const struct radv_dispatch_info info = {
		.blocks = { x, y, z },
		.offsets = { base_x, base_y, base_z },
	};

	radv_dispatch(cmd_buffer, &info);
}
5820 
/**
 * Direct dispatch of an x * y * z grid starting at the origin; forwarded to
 * radv_CmdDispatchBase() with a zero base.
 */
void radv_CmdDispatch(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    x,
	uint32_t                                    y,
	uint32_t                                    z)
{
	const uint32_t base_x = 0;
	const uint32_t base_y = 0;
	const uint32_t base_z = 0;

	radv_CmdDispatchBase(commandBuffer, base_x, base_y, base_z, x, y, z);
}
5829 
/**
 * Indirect dispatch: the dispatch parameters are read from _buffer at the
 * given offset. The request is forwarded to radv_dispatch().
 */
void radv_CmdDispatchIndirect(
	VkCommandBuffer                             commandBuffer,
	VkBuffer                                    _buffer,
	VkDeviceSize                                offset)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
	const struct radv_dispatch_info info = {
		.indirect = buffer,
		.indirect_offset = offset,
	};

	radv_dispatch(cmd_buffer, &info);
}
5844 
radv_unaligned_dispatch(struct radv_cmd_buffer * cmd_buffer,uint32_t x,uint32_t y,uint32_t z)5845 void radv_unaligned_dispatch(
5846 	struct radv_cmd_buffer                      *cmd_buffer,
5847 	uint32_t                                    x,
5848 	uint32_t                                    y,
5849 	uint32_t                                    z)
5850 {
5851 	struct radv_dispatch_info info = {0};
5852 
5853 	info.blocks[0] = x;
5854 	info.blocks[1] = y;
5855 	info.blocks[2] = z;
5856 	info.unaligned = 1;
5857 
5858 	radv_dispatch(cmd_buffer, &info);
5859 }
5860 
5861 void
radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer * cmd_buffer)5862 radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
5863 {
5864 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
5865 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
5866 
5867 	cmd_buffer->state.pass = NULL;
5868 	cmd_buffer->state.subpass = NULL;
5869 	cmd_buffer->state.attachments = NULL;
5870 	cmd_buffer->state.framebuffer = NULL;
5871 	cmd_buffer->state.subpass_sample_locs = NULL;
5872 }
5873 
/**
 * End the current render pass: apply the pass's end barrier, finish the
 * current subpass and then release the render-pass state.
 */
void radv_CmdEndRenderPass(
	VkCommandBuffer                             commandBuffer)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	struct radv_render_pass *pass = cmd_buffer->state.pass;

	/* The barrier must be read before the state is torn down below. */
	radv_subpass_barrier(cmd_buffer, &pass->end_barrier);

	radv_cmd_buffer_end_subpass(cmd_buffer);

	radv_cmd_buffer_end_render_pass(cmd_buffer);
}
5885 
/**
 * Variant of radv_CmdEndRenderPass() that also receives a VkSubpassEndInfo.
 * The extra structure is not consulted; the call is forwarded unchanged.
 */
void radv_CmdEndRenderPass2(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassEndInfo*                     pSubpassEndInfo)
{
	(void)pSubpassEndInfo;

	radv_CmdEndRenderPass(commandBuffer);
}
5892 
5893 /*
5894  * For HTILE we have the following interesting clear words:
5895  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
5896  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
5897  *   0xfffffff0: Clear depth to 1.0
5898  *   0x00000000: Clear depth to 0.0
5899  */
radv_initialize_htile(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range)5900 static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer,
5901                                   struct radv_image *image,
5902                                   const VkImageSubresourceRange *range)
5903 {
5904 	assert(range->baseMipLevel == 0);
5905 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS);
5906 	VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
5907 	struct radv_cmd_state *state = &cmd_buffer->state;
5908 	uint32_t htile_value = vk_format_is_stencil(image->vk_format) ? 0xfffff30f : 0xfffc000f;
5909 	VkClearDepthStencilValue value = {0};
5910 	struct radv_barrier_data barrier = {0};
5911 
5912 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5913 			     RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5914 
5915 	barrier.layout_transitions.init_mask_ram = 1;
5916 	radv_describe_layout_transition(cmd_buffer, &barrier);
5917 
5918 	state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
5919 
5920 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5921 
5922 	if (vk_format_is_stencil(image->vk_format))
5923 		aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5924 
5925 	radv_set_ds_clear_metadata(cmd_buffer, image, range, value, aspects);
5926 
5927 	if (radv_image_is_tc_compat_htile(image)) {
5928 		/* Initialize the TC-compat metada value to 0 because by
5929 		 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
5930 		 * need have to conditionally update its value when performing
5931 		 * a fast depth clear.
5932 		 */
5933 		radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
5934 	}
5935 }
5936 
/**
 * Handle a layout transition of a depth/stencil image with respect to HTILE.
 *
 * Nothing is done for images without HTILE. HTILE is (re)initialized when
 * the source layout is undefined or when going from a non-compressed to a
 * compressed layout; going from compressed to non-compressed decompresses
 * the image, with DB flushes on both sides. sample_locs is forwarded to the
 * decompression.
 */
static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
					       struct radv_image *image,
					       VkImageLayout src_layout,
					       bool src_render_loop,
					       VkImageLayout dst_layout,
					       bool dst_render_loop,
					       unsigned src_queue_mask,
					       unsigned dst_queue_mask,
					       const VkImageSubresourceRange *range,
					       struct radv_sample_locations_state *sample_locs)
{
	if (!radv_image_has_htile(image))
		return;

	/* Undefined contents: the metadata only needs to be initialized. */
	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
		radv_initialize_htile(cmd_buffer, image, range);
		return;
	}

	const bool src_compressed =
		radv_layout_is_htile_compressed(cmd_buffer->device, image,
						src_layout, src_render_loop,
						src_queue_mask);
	const bool dst_compressed =
		radv_layout_is_htile_compressed(cmd_buffer->device, image,
						dst_layout, dst_render_loop,
						dst_queue_mask);

	/* No change in compression state: nothing to do. */
	if (src_compressed == dst_compressed)
		return;

	if (dst_compressed) {
		/* Uncompressed -> compressed. */
		radv_initialize_htile(cmd_buffer, image, range);
		return;
	}

	/* Compressed -> uncompressed. */
	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
	                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;

	radv_decompress_depth_stencil(cmd_buffer, image, range, sample_locs);

	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
	                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
}
5968 
radv_initialise_cmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)5969 static void radv_initialise_cmask(struct radv_cmd_buffer *cmd_buffer,
5970 				  struct radv_image *image,
5971 				  const VkImageSubresourceRange *range,
5972 				  uint32_t value)
5973 {
5974 	struct radv_cmd_state *state = &cmd_buffer->state;
5975 	struct radv_barrier_data barrier = {0};
5976 
5977 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
5978 			    RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5979 
5980 	barrier.layout_transitions.init_mask_ram = 1;
5981 	radv_describe_layout_transition(cmd_buffer, &barrier);
5982 
5983 	state->flush_bits |= radv_clear_cmask(cmd_buffer, image, range, value);
5984 
5985 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5986 }
5987 
radv_initialize_fmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range)5988 void radv_initialize_fmask(struct radv_cmd_buffer *cmd_buffer,
5989 			   struct radv_image *image,
5990 			   const VkImageSubresourceRange *range)
5991 {
5992 	struct radv_cmd_state *state = &cmd_buffer->state;
5993 	static const uint32_t fmask_clear_values[4] = {
5994 		0x00000000,
5995 		0x02020202,
5996 		0xE4E4E4E4,
5997 		0x76543210
5998 	};
5999 	uint32_t log2_samples = util_logbase2(image->info.samples);
6000 	uint32_t value = fmask_clear_values[log2_samples];
6001 	struct radv_barrier_data barrier = {0};
6002 
6003 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
6004 			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6005 
6006 	barrier.layout_transitions.init_mask_ram = 1;
6007 	radv_describe_layout_transition(cmd_buffer, &barrier);
6008 
6009 	state->flush_bits |= radv_clear_fmask(cmd_buffer, image, range, value);
6010 
6011 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6012 }
6013 
radv_initialize_dcc(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)6014 void radv_initialize_dcc(struct radv_cmd_buffer *cmd_buffer,
6015 			 struct radv_image *image,
6016 			 const VkImageSubresourceRange *range, uint32_t value)
6017 {
6018 	struct radv_cmd_state *state = &cmd_buffer->state;
6019 	struct radv_barrier_data barrier = {0};
6020 	unsigned size = 0;
6021 
6022 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
6023 			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6024 
6025 	barrier.layout_transitions.init_mask_ram = 1;
6026 	radv_describe_layout_transition(cmd_buffer, &barrier);
6027 
6028 	state->flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
6029 
6030 	if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
6031 		/* When DCC is enabled with mipmaps, some levels might not
6032 		 * support fast clears and we have to initialize them as "fully
6033 		 * expanded".
6034 		 */
6035 		/* Compute the size of all fast clearable DCC levels. */
6036 		for (unsigned i = 0; i < image->planes[0].surface.num_dcc_levels; i++) {
6037 			struct legacy_surf_level *surf_level =
6038 				&image->planes[0].surface.u.legacy.level[i];
6039 			unsigned dcc_fast_clear_size =
6040 				surf_level->dcc_slice_fast_clear_size * image->info.array_size;
6041 
6042 			if (!dcc_fast_clear_size)
6043 				break;
6044 
6045 			size = surf_level->dcc_offset + dcc_fast_clear_size;
6046 		}
6047 
6048 		/* Initialize the mipmap levels without DCC. */
6049 		if (size != image->planes[0].surface.dcc_size) {
6050 			state->flush_bits |=
6051 				radv_fill_buffer(cmd_buffer, image->bo,
6052 						 image->offset + image->planes[0].surface.dcc_offset + size,
6053 						 image->planes[0].surface.dcc_size - size,
6054 						 0xffffffff);
6055 		}
6056 	}
6057 
6058 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
6059 			     RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6060 }
6061 
6062 /**
6063  * Initialize DCC/FMASK/CMASK metadata for a color image.
6064  */
radv_init_color_image_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,bool src_render_loop,VkImageLayout dst_layout,bool dst_render_loop,unsigned src_queue_mask,unsigned dst_queue_mask,const VkImageSubresourceRange * range)6065 static void radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer,
6066 					   struct radv_image *image,
6067 					   VkImageLayout src_layout,
6068 					   bool src_render_loop,
6069 					   VkImageLayout dst_layout,
6070 					   bool dst_render_loop,
6071 					   unsigned src_queue_mask,
6072 					   unsigned dst_queue_mask,
6073 					   const VkImageSubresourceRange *range)
6074 {
6075 	if (radv_image_has_cmask(image)) {
6076 		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
6077 
6078 		/*  TODO: clarify why 0xccccccccu is used. */
6079 
6080 		/* If CMASK isn't updated with the new layout, we should use the
6081 		 * fully expanded mode so that the image is read correctly if
6082 		 * CMASK is used (such as when transitioning to a compressed
6083 		 * layout).
6084 		 */
6085 		if (radv_image_has_fmask(image) &&
6086 		    radv_layout_can_fast_clear(image, dst_layout,
6087 					       dst_render_loop, dst_queue_mask)) {
6088 			value = 0xccccccccu;
6089 		}
6090 
6091 		radv_initialise_cmask(cmd_buffer, image, range, value);
6092 	}
6093 
6094 	if (radv_image_has_fmask(image)) {
6095 		radv_initialize_fmask(cmd_buffer, image, range);
6096 	}
6097 
6098 	if (radv_dcc_enabled(image, range->baseMipLevel)) {
6099 		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
6100 
6101 		if (radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout,
6102 					       dst_render_loop,
6103 					       dst_queue_mask)) {
6104 			value = 0u;
6105 		}
6106 
6107 		radv_initialize_dcc(cmd_buffer, image, range, value);
6108 
6109 		radv_update_fce_metadata(cmd_buffer, image, range, false);
6110 	}
6111 
6112 	if (radv_image_has_cmask(image) ||
6113 	    radv_dcc_enabled(image, range->baseMipLevel)) {
6114 		uint32_t color_values[2] = {0};
6115 		radv_set_color_clear_metadata(cmd_buffer, image, range,
6116 					      color_values);
6117 	}
6118 }
6119 
6120 /**
6121  * Handle color image transitions for DCC/FMASK/CMASK.
6122  */
radv_handle_color_image_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,bool src_render_loop,VkImageLayout dst_layout,bool dst_render_loop,unsigned src_queue_mask,unsigned dst_queue_mask,const VkImageSubresourceRange * range)6123 static void radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer,
6124 					       struct radv_image *image,
6125 					       VkImageLayout src_layout,
6126 					       bool src_render_loop,
6127 					       VkImageLayout dst_layout,
6128 					       bool dst_render_loop,
6129 					       unsigned src_queue_mask,
6130 					       unsigned dst_queue_mask,
6131 					       const VkImageSubresourceRange *range)
6132 {
6133 	if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
6134 		radv_init_color_image_metadata(cmd_buffer, image,
6135 					       src_layout, src_render_loop,
6136 					       dst_layout, dst_render_loop,
6137 					       src_queue_mask, dst_queue_mask,
6138 					       range);
6139 		return;
6140 	}
6141 
6142 	if (radv_dcc_enabled(image, range->baseMipLevel)) {
6143 		if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
6144 			radv_initialize_dcc(cmd_buffer, image, range, 0xffffffffu);
6145 		} else if (radv_layout_dcc_compressed(cmd_buffer->device, image, src_layout, src_render_loop, src_queue_mask) &&
6146 		           !radv_layout_dcc_compressed(cmd_buffer->device, image, dst_layout, dst_render_loop, dst_queue_mask)) {
6147 			radv_decompress_dcc(cmd_buffer, image, range);
6148 		} else if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
6149 			   !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
6150 			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
6151 		}
6152 	} else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
6153 		bool fce_eliminate = false, fmask_expand = false;
6154 
6155 		if (radv_layout_can_fast_clear(image, src_layout, src_render_loop, src_queue_mask) &&
6156 		    !radv_layout_can_fast_clear(image, dst_layout, dst_render_loop, dst_queue_mask)) {
6157 			fce_eliminate = true;
6158 		}
6159 
6160 		if (radv_image_has_fmask(image) &&
6161 		    (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT |
6162 				     VK_IMAGE_USAGE_TRANSFER_DST_BIT))) {
6163 			if (src_layout != VK_IMAGE_LAYOUT_GENERAL &&
6164 			    dst_layout == VK_IMAGE_LAYOUT_GENERAL) {
6165 				/* A FMASK decompress is required before doing
6166 				 * a MSAA decompress using FMASK.
6167 				 */
6168 				fmask_expand = true;
6169 			}
6170 		}
6171 
6172 		if (fce_eliminate || fmask_expand)
6173 			radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
6174 
6175 		if (fmask_expand) {
6176 			struct radv_barrier_data barrier = {0};
6177 			barrier.layout_transitions.fmask_color_expand = 1;
6178 			radv_describe_layout_transition(cmd_buffer, &barrier);
6179 
6180 			radv_expand_fmask_image_inplace(cmd_buffer, image, range);
6181 		}
6182 	}
6183 }
6184 
/**
 * Entry point for image layout transitions.
 *
 * Decides whether this command buffer has to perform the transition at all
 * (queue family ownership transfers may be handled by the other queue), then
 * forwards to the depth or color specific handler with the queue family
 * masks of the source and destination.
 */
static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
					 struct radv_image *image,
					 VkImageLayout src_layout,
					 bool src_render_loop,
					 VkImageLayout dst_layout,
					 bool dst_render_loop,
					 uint32_t src_family,
					 uint32_t dst_family,
					 const VkImageSubresourceRange *range,
					 struct radv_sample_locations_state *sample_locs)
{
	const bool ownership_transfer =
		image->exclusive && src_family != dst_family;

	if (ownership_transfer) {
		/* This is one half of an acquire/release pair; the other
		 * half is recorded elsewhere. The transition is done in the
		 * most flexible queue, so the less flexible side bails out.
		 */
		assert(src_family == cmd_buffer->queue_family_index ||
		       dst_family == cmd_buffer->queue_family_index);

		if (src_family == VK_QUEUE_FAMILY_EXTERNAL ||
		    src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
			return;

		switch (cmd_buffer->queue_family_index) {
		case RADV_QUEUE_TRANSFER:
			return;
		case RADV_QUEUE_COMPUTE:
			if (src_family == RADV_QUEUE_GENERAL ||
			    dst_family == RADV_QUEUE_GENERAL)
				return;
			break;
		default:
			break;
		}
	}

	/* Nothing changes: no work to do. */
	if (src_layout == dst_layout && src_render_loop == dst_render_loop)
		return;

	unsigned src_queue_mask =
		radv_image_queue_family_mask(image, src_family,
					     cmd_buffer->queue_family_index);
	unsigned dst_queue_mask =
		radv_image_queue_family_mask(image, dst_family,
					     cmd_buffer->queue_family_index);

	if (!vk_format_is_depth(image->vk_format)) {
		radv_handle_color_image_transition(cmd_buffer, image,
						   src_layout, src_render_loop,
						   dst_layout, dst_render_loop,
						   src_queue_mask, dst_queue_mask,
						   range);
		return;
	}

	radv_handle_depth_image_transition(cmd_buffer, image,
					   src_layout, src_render_loop,
					   dst_layout, dst_render_loop,
					   src_queue_mask, dst_queue_mask,
					   range, sample_locs);
}
6241 
/* Extra parameters handed to radv_barrier() by the barrier and wait-events
 * entry points.
 */
struct radv_barrier_info {
	/* Reported through radv_describe_barrier_start(). */
	enum rgp_barrier_reason reason;
	/* Number of entries in pEvents (0 when there is nothing to wait on). */
	uint32_t eventCount;
	/* Events whose memory radv_barrier() waits on before flushing. */
	const VkEvent *pEvents;
	/* Stages handed to radv_stage_flush() and tested for the CP DMA idle
	 * wait.
	 */
	VkPipelineStageFlags srcStageMask;
	/* When equal to VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, radv_barrier()
	 * skips the source stage flush.
	 */
	VkPipelineStageFlags dstStageMask;
};
6249 
6250 static void
radv_barrier(struct radv_cmd_buffer * cmd_buffer,uint32_t memoryBarrierCount,const VkMemoryBarrier * pMemoryBarriers,uint32_t bufferMemoryBarrierCount,const VkBufferMemoryBarrier * pBufferMemoryBarriers,uint32_t imageMemoryBarrierCount,const VkImageMemoryBarrier * pImageMemoryBarriers,const struct radv_barrier_info * info)6251 radv_barrier(struct radv_cmd_buffer *cmd_buffer,
6252 	     uint32_t memoryBarrierCount,
6253 	     const VkMemoryBarrier *pMemoryBarriers,
6254 	     uint32_t bufferMemoryBarrierCount,
6255 	     const VkBufferMemoryBarrier *pBufferMemoryBarriers,
6256 	     uint32_t imageMemoryBarrierCount,
6257 	     const VkImageMemoryBarrier *pImageMemoryBarriers,
6258 	     const struct radv_barrier_info *info)
6259 {
6260 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6261 	enum radv_cmd_flush_bits src_flush_bits = 0;
6262 	enum radv_cmd_flush_bits dst_flush_bits = 0;
6263 
6264 	radv_describe_barrier_start(cmd_buffer, info->reason);
6265 
6266 	for (unsigned i = 0; i < info->eventCount; ++i) {
6267 		RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
6268 		uint64_t va = radv_buffer_get_va(event->bo);
6269 
6270 		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
6271 
6272 		ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
6273 
6274 		radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
6275 		assert(cmd_buffer->cs->cdw <= cdw_max);
6276 	}
6277 
6278 	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
6279 		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
6280 							NULL);
6281 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
6282 		                                        NULL);
6283 	}
6284 
6285 	for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
6286 		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
6287 							NULL);
6288 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
6289 		                                        NULL);
6290 	}
6291 
6292 	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
6293 		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
6294 
6295 		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
6296 							image);
6297 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
6298 		                                        image);
6299 	}
6300 
6301 	/* The Vulkan spec 1.1.98 says:
6302 	 *
6303 	 * "An execution dependency with only
6304 	 *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
6305 	 *  will only prevent that stage from executing in subsequently
6306 	 *  submitted commands. As this stage does not perform any actual
6307 	 *  execution, this is not observable - in effect, it does not delay
6308 	 *  processing of subsequent commands. Similarly an execution dependency
6309 	 *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
6310 	 *  will effectively not wait for any prior commands to complete."
6311 	 */
6312 	if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
6313 		radv_stage_flush(cmd_buffer, info->srcStageMask);
6314 	cmd_buffer->state.flush_bits |= src_flush_bits;
6315 
6316 	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
6317 		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
6318 
6319 		const struct VkSampleLocationsInfoEXT *sample_locs_info =
6320 			vk_find_struct_const(pImageMemoryBarriers[i].pNext,
6321 					     SAMPLE_LOCATIONS_INFO_EXT);
6322 		struct radv_sample_locations_state sample_locations = {0};
6323 
6324 		if (sample_locs_info) {
6325 			assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
6326 			sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
6327 			sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
6328 			sample_locations.count = sample_locs_info->sampleLocationsCount;
6329 			typed_memcpy(&sample_locations.locations[0],
6330 				     sample_locs_info->pSampleLocations,
6331 				     sample_locs_info->sampleLocationsCount);
6332 		}
6333 
6334 		radv_handle_image_transition(cmd_buffer, image,
6335 					     pImageMemoryBarriers[i].oldLayout,
6336 					     false, /* Outside of a renderpass we are never in a renderloop */
6337 					     pImageMemoryBarriers[i].newLayout,
6338 					     false, /* Outside of a renderpass we are never in a renderloop */
6339 					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
6340 					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
6341 					     &pImageMemoryBarriers[i].subresourceRange,
6342 					     sample_locs_info ? &sample_locations : NULL);
6343 	}
6344 
6345 	/* Make sure CP DMA is idle because the driver might have performed a
6346 	 * DMA operation for copying or filling buffers/images.
6347 	 */
6348 	if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
6349 				  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
6350 		si_cp_dma_wait_for_idle(cmd_buffer);
6351 
6352 	cmd_buffer->state.flush_bits |= dst_flush_bits;
6353 
6354 	radv_describe_barrier_end(cmd_buffer);
6355 }
6356 
/**
 * vkCmdPipelineBarrier entry point: a barrier that does not wait on any
 * event. byRegion is not used.
 */
void radv_CmdPipelineBarrier(
	VkCommandBuffer                             commandBuffer,
	VkPipelineStageFlags                        srcStageMask,
	VkPipelineStageFlags                        destStageMask,
	VkBool32                                    byRegion,
	uint32_t                                    memoryBarrierCount,
	const VkMemoryBarrier*                      pMemoryBarriers,
	uint32_t                                    bufferMemoryBarrierCount,
	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
	uint32_t                                    imageMemoryBarrierCount,
	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	const struct radv_barrier_info info = {
		.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER,
		.eventCount = 0,
		.pEvents = NULL,
		.srcStageMask = srcStageMask,
		.dstStageMask = destStageMask,
	};

	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}
6382 
6383 
write_event(struct radv_cmd_buffer * cmd_buffer,struct radv_event * event,VkPipelineStageFlags stageMask,unsigned value)6384 static void write_event(struct radv_cmd_buffer *cmd_buffer,
6385 			struct radv_event *event,
6386 			VkPipelineStageFlags stageMask,
6387 			unsigned value)
6388 {
6389 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6390 	uint64_t va = radv_buffer_get_va(event->bo);
6391 
6392 	si_emit_cache_flush(cmd_buffer);
6393 
6394 	radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
6395 
6396 	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
6397 
6398 	/* Flags that only require a top-of-pipe event. */
6399 	VkPipelineStageFlags top_of_pipe_flags =
6400 		VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
6401 
6402 	/* Flags that only require a post-index-fetch event. */
6403 	VkPipelineStageFlags post_index_fetch_flags =
6404 		top_of_pipe_flags |
6405 		VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
6406 		VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
6407 
6408 	/* Make sure CP DMA is idle because the driver might have performed a
6409 	 * DMA operation for copying or filling buffers/images.
6410 	 */
6411 	if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT |
6412 			 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
6413 		si_cp_dma_wait_for_idle(cmd_buffer);
6414 
6415 	/* TODO: Emit EOS events for syncing PS/CS stages. */
6416 
6417 	if (!(stageMask & ~top_of_pipe_flags)) {
6418 		/* Just need to sync the PFP engine. */
6419 		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
6420 		radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
6421 				S_370_WR_CONFIRM(1) |
6422 				S_370_ENGINE_SEL(V_370_PFP));
6423 		radeon_emit(cs, va);
6424 		radeon_emit(cs, va >> 32);
6425 		radeon_emit(cs, value);
6426 	} else if (!(stageMask & ~post_index_fetch_flags)) {
6427 		/* Sync ME because PFP reads index and indirect buffers. */
6428 		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
6429 		radeon_emit(cs, S_370_DST_SEL(V_370_MEM) |
6430 				S_370_WR_CONFIRM(1) |
6431 				S_370_ENGINE_SEL(V_370_ME));
6432 		radeon_emit(cs, va);
6433 		radeon_emit(cs, va >> 32);
6434 		radeon_emit(cs, value);
6435 	} else {
6436 		/* Otherwise, sync all prior GPU work using an EOP event. */
6437 		si_cs_emit_write_event_eop(cs,
6438 					   cmd_buffer->device->physical_device->rad_info.chip_class,
6439 					   radv_cmd_buffer_uses_mec(cmd_buffer),
6440 					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
6441 					   EOP_DST_SEL_MEM,
6442 					   EOP_DATA_SEL_VALUE_32BIT, va, value,
6443 					   cmd_buffer->gfx9_eop_bug_va);
6444 	}
6445 
6446 	assert(cmd_buffer->cs->cdw <= cdw_max);
6447 }
6448 
/**
 * vkCmdSetEvent entry point: writes 1 into the event's buffer once stageMask
 * has been reached (see write_event()).
 */
void radv_CmdSetEvent(VkCommandBuffer commandBuffer,
		      VkEvent _event,
		      VkPipelineStageFlags stageMask)
{
	/* Value stored for a set event. */
	const unsigned event_set = 1;
	struct radv_cmd_buffer *cmd_buffer =
		radv_cmd_buffer_from_handle(commandBuffer);
	struct radv_event *event = radv_event_from_handle(_event);

	write_event(cmd_buffer, event, stageMask, event_set);
}
6458 
/**
 * vkCmdResetEvent entry point: writes 0 into the event's buffer once
 * stageMask has been reached (see write_event()).
 */
void radv_CmdResetEvent(VkCommandBuffer commandBuffer,
			VkEvent _event,
			VkPipelineStageFlags stageMask)
{
	/* Value stored for a reset event. */
	const unsigned event_reset = 0;
	struct radv_cmd_buffer *cmd_buffer =
		radv_cmd_buffer_from_handle(commandBuffer);
	struct radv_event *event = radv_event_from_handle(_event);

	write_event(cmd_buffer, event, stageMask, event_reset);
}
6468 
/**
 * vkCmdWaitEvents entry point.
 *
 * Builds the barrier description and lets radv_barrier() wait on the events
 * and process the memory/buffer/image barriers.
 */
void radv_CmdWaitEvents(VkCommandBuffer commandBuffer,
			uint32_t eventCount,
			const VkEvent* pEvents,
			VkPipelineStageFlags srcStageMask,
			VkPipelineStageFlags dstStageMask,
			uint32_t memoryBarrierCount,
			const VkMemoryBarrier* pMemoryBarriers,
			uint32_t bufferMemoryBarrierCount,
			const VkBufferMemoryBarrier* pBufferMemoryBarriers,
			uint32_t imageMemoryBarrierCount,
			const VkImageMemoryBarrier* pImageMemoryBarriers)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	/* Every field must be set: radv_barrier() reads dstStageMask to
	 * decide whether to flush the source stages, so leaving it
	 * uninitialized made that decision depend on stack garbage.
	 *
	 * The source stage mask is deliberately kept at 0 and the caller's
	 * srcStageMask is not forwarded.
	 * NOTE(review): presumably the wait on the events themselves stands
	 * in for the source stage synchronization - confirm.
	 */
	const struct radv_barrier_info info = {
		.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS,
		.eventCount = eventCount,
		.pEvents = pEvents,
		.srcStageMask = 0,
		.dstStageMask = dstStageMask,
	};

	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}
6493 
6494 
/**
 * vkCmdSetDeviceMask entry point. Nothing is recorded; both arguments are
 * ignored.
 */
void radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer,
			   uint32_t deviceMask)
{
	/* No-op */
	(void)commandBuffer;
	(void)deviceMask;
}
6500 
6501 /* VK_EXT_conditional_rendering */
/**
 * vkCmdBeginConditionalRenderingEXT entry point.
 *
 * Copies the application's 32-bit predicate into a zero-initialized 64-bit
 * slot of the upload buffer and enables predication on that copy.
 */
void radv_CmdBeginConditionalRenderingEXT(
	VkCommandBuffer                             commandBuffer,
	const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t pred_value = 0;
	unsigned pred_offset;

	/* Address of the application-provided 32-bit predicate. */
	const uint64_t va = radv_buffer_get_va(buffer->bo) +
			    pConditionalRenderingBegin->offset;

	/* By default, if the 32-bit value at offset in buffer memory is zero,
	 * then the rendering commands are discarded, otherwise they are
	 * executed as normal. If the inverted flag is set, all commands are
	 * discarded if the value is non zero.
	 */
	const bool draw_visible =
		!(pConditionalRenderingBegin->flags &
		  VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT);

	si_emit_cache_flush(cmd_buffer);

	/* From the Vulkan spec 1.1.107:
	 *
	 * "If the 32-bit value at offset in buffer memory is zero, then the
	 *  rendering commands are discarded, otherwise they are executed as
	 *  normal. If the value of the predicate in buffer memory changes while
	 *  conditional rendering is active, the rendering commands may be
	 *  discarded in an implementation-dependent way. Some implementations
	 *  may latch the value of the predicate upon beginning conditional
	 *  rendering while others may read it before every rendering command."
	 *
	 * But, the AMD hardware treats the predicate as a 64-bit value which
	 * means we need a workaround in the driver. Luckily, it's not required
	 * to support if the value changes when predication is active.
	 *
	 * The workaround is as follows:
	 * 1) allocate a 64-bit value in the upload BO and initialize it to 0
	 * 2) copy the 32-bit predicate value to the upload BO
	 * 3) use the new allocated VA address for predication
	 *
	 * Based on the conditionalrender demo, it's faster to do the COPY_DATA
	 * in ME  (+ sync PFP) instead of PFP.
	 */
	radv_cmd_buffer_upload_data(cmd_buffer, sizeof(pred_value), 16,
				    &pred_value, &pred_offset);

	const uint64_t new_va =
		radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

	/* Step 2: memory-to-memory copy of the predicate. */
	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
			COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
			COPY_DATA_WR_CONFIRM);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);
	radeon_emit(cs, new_va);
	radeon_emit(cs, new_va >> 32);

	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
	radeon_emit(cs, 0);

	/* Enable predication for this command buffer. */
	si_emit_set_predication_state(cmd_buffer, draw_visible, new_va);
	cmd_buffer->state.predicating = true;

	/* Store conditional rendering user info. */
	cmd_buffer->state.predication_type = draw_visible;
	cmd_buffer->state.predication_va = new_va;
}
6574 
/**
 * vkCmdEndConditionalRenderingEXT entry point: turns predication off and
 * clears the conditional rendering state kept in the command buffer.
 */
void radv_CmdEndConditionalRenderingEXT(
	VkCommandBuffer                             commandBuffer)
{
	struct radv_cmd_buffer *cmd_buffer =
		radv_cmd_buffer_from_handle(commandBuffer);

	/* Disable predication for this command buffer. */
	si_emit_set_predication_state(cmd_buffer, false, 0);

	/* Reset conditional rendering user info; -1 is the value stored in
	 * predication_type while no predicate is active.
	 */
	cmd_buffer->state.predication_va = 0;
	cmd_buffer->state.predication_type = -1;
	cmd_buffer->state.predicating = false;
}
6588 
6589 /* VK_EXT_transform_feedback */
/**
 * vkCmdBindTransformFeedbackBuffersEXT entry point.
 *
 * Records buffer, offset and size for bindings
 * [firstBinding, firstBinding + bindingCount), adds the buffers to the
 * command stream, marks the bindings as enabled and flags the streamout
 * buffers as dirty.
 */
void radv_CmdBindTransformFeedbackBuffersEXT(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstBinding,
    uint32_t                                    bindingCount,
    const VkBuffer*                             pBuffers,
    const VkDeviceSize*                         pOffsets,
    const VkDeviceSize*                         pSizes)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	uint8_t enabled_mask = 0;

	assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
	for (uint32_t i = 0; i < bindingCount; i++) {
		const uint32_t idx = firstBinding + i;
		struct radv_streamout_binding *binding =
			&cmd_buffer->streamout_bindings[idx];

		binding->buffer = radv_buffer_from_handle(pBuffers[i]);
		binding->offset = pOffsets[i];

		/* pSizes is optional; a missing array or VK_WHOLE_SIZE means
		 * "everything from the offset to the end of the buffer".
		 */
		if (pSizes && pSizes[i] != VK_WHOLE_SIZE) {
			binding->size = pSizes[i];
		} else {
			binding->size = binding->buffer->size - binding->offset;
		}

		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
				   binding->buffer->bo);

		enabled_mask |= 1 << idx;
	}

	/* Bindings are only ever added to the enabled mask here. */
	cmd_buffer->state.streamout.enabled_mask |= enabled_mask;

	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
6625 
6626 static void
radv_emit_streamout_enable(struct radv_cmd_buffer * cmd_buffer)6627 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
6628 {
6629 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6630 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6631 
6632 	radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
6633 	radeon_emit(cs,
6634 		    S_028B94_STREAMOUT_0_EN(so->streamout_enabled) |
6635 		    S_028B94_RAST_STREAM(0) |
6636 		    S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
6637 		    S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
6638 		    S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
6639 	radeon_emit(cs, so->hw_enabled_mask &
6640 			so->enabled_stream_buffers_mask);
6641 
6642 	cmd_buffer->state.context_roll_without_scissor_emitted = true;
6643 }
6644 
6645 static void
radv_set_streamout_enable(struct radv_cmd_buffer * cmd_buffer,bool enable)6646 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
6647 {
6648 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6649 	bool old_streamout_enabled = so->streamout_enabled;
6650 	uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
6651 
6652 	so->streamout_enabled = enable;
6653 
6654 	so->hw_enabled_mask = so->enabled_mask |
6655 			      (so->enabled_mask << 4) |
6656 			      (so->enabled_mask << 8) |
6657 			      (so->enabled_mask << 12);
6658 
6659 	if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
6660 	    ((old_streamout_enabled != so->streamout_enabled) ||
6661 	     (old_hw_enabled_mask != so->hw_enabled_mask)))
6662 		radv_emit_streamout_enable(cmd_buffer);
6663 
6664 	if (cmd_buffer->device->physical_device->use_ngg_streamout) {
6665 		cmd_buffer->gds_needed = true;
6666 		cmd_buffer->gds_oa_needed = true;
6667 	}
6668 }
6669 
radv_flush_vgt_streamout(struct radv_cmd_buffer * cmd_buffer)6670 static void radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
6671 {
6672 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6673 	unsigned reg_strmout_cntl;
6674 
6675 	/* The register is at different places on different ASICs. */
6676 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
6677 		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
6678 		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
6679 	} else {
6680 		reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
6681 		radeon_set_config_reg(cs, reg_strmout_cntl, 0);
6682 	}
6683 
6684 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
6685 	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
6686 
6687 	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
6688 	radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
6689 	radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
6690 	radeon_emit(cs, 0);
6691 	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
6692 	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
6693 	radeon_emit(cs, 4); /* poll interval */
6694 }
6695 
6696 static void
radv_emit_streamout_begin(struct radv_cmd_buffer * cmd_buffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)6697 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer,
6698 			  uint32_t firstCounterBuffer,
6699 			  uint32_t counterBufferCount,
6700 			  const VkBuffer *pCounterBuffers,
6701 			  const VkDeviceSize *pCounterBufferOffsets)
6702 
6703 {
6704 	struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
6705 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6706 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6707 	uint32_t i;
6708 
6709 	radv_flush_vgt_streamout(cmd_buffer);
6710 
6711 	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6712 	for_each_bit(i, so->enabled_mask) {
6713 		int32_t counter_buffer_idx = i - firstCounterBuffer;
6714 		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6715 			counter_buffer_idx = -1;
6716 
6717 		/* AMD GCN binds streamout buffers as shader resources.
6718 		 * VGT only counts primitives and tells the shader through
6719 		 * SGPRs what to do.
6720 		 */
6721 		radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
6722 		radeon_emit(cs, sb[i].size >> 2);	/* BUFFER_SIZE (in DW) */
6723 		radeon_emit(cs, so->stride_in_dw[i]);			/* VTX_STRIDE (in DW) */
6724 
6725 		cmd_buffer->state.context_roll_without_scissor_emitted = true;
6726 
6727 		if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6728 			/* The array of counter buffers is optional. */
6729 			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6730 			uint64_t va = radv_buffer_get_va(buffer->bo);
6731 			uint64_t counter_buffer_offset = 0;
6732 
6733 			if (pCounterBufferOffsets)
6734 				counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
6735 
6736 			va += buffer->offset + counter_buffer_offset;
6737 
6738 			/* Append */
6739 			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6740 			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6741 					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6742 					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
6743 			radeon_emit(cs, 0); /* unused */
6744 			radeon_emit(cs, 0); /* unused */
6745 			radeon_emit(cs, va); /* src address lo */
6746 			radeon_emit(cs, va >> 32); /* src address hi */
6747 
6748 			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6749 		} else {
6750 			/* Start from the beginning. */
6751 			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6752 			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6753 					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6754 					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
6755 			radeon_emit(cs, 0); /* unused */
6756 			radeon_emit(cs, 0); /* unused */
6757 			radeon_emit(cs, 0); /* unused */
6758 			radeon_emit(cs, 0); /* unused */
6759 		}
6760 	}
6761 
6762 	radv_set_streamout_enable(cmd_buffer, true);
6763 }
6764 
6765 static void
gfx10_emit_streamout_begin(struct radv_cmd_buffer * cmd_buffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)6766 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer,
6767 			   uint32_t firstCounterBuffer,
6768 			   uint32_t counterBufferCount,
6769 			   const VkBuffer *pCounterBuffers,
6770 			   const VkDeviceSize *pCounterBufferOffsets)
6771 {
6772 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6773 	unsigned last_target = util_last_bit(so->enabled_mask) - 1;
6774 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6775 	uint32_t i;
6776 
6777 	assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6778 	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6779 
6780 	/* Sync because the next streamout operation will overwrite GDS and we
6781 	 * have to make sure it's idle.
6782 	 * TODO: Improve by tracking if there is a streamout operation in
6783 	 * flight.
6784 	 */
6785 	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
6786 	si_emit_cache_flush(cmd_buffer);
6787 
6788 	for_each_bit(i, so->enabled_mask) {
6789 		int32_t counter_buffer_idx = i - firstCounterBuffer;
6790 		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6791 			counter_buffer_idx = -1;
6792 
6793 		bool append = counter_buffer_idx >= 0 &&
6794 			      pCounterBuffers && pCounterBuffers[counter_buffer_idx];
6795 		uint64_t va = 0;
6796 
6797 		if (append) {
6798 			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6799 			uint64_t counter_buffer_offset = 0;
6800 
6801 			if (pCounterBufferOffsets)
6802 				counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
6803 
6804 			va += radv_buffer_get_va(buffer->bo);
6805 			va += buffer->offset + counter_buffer_offset;
6806 
6807 			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6808 		}
6809 
6810 		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
6811 		radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
6812 				S_411_DST_SEL(V_411_GDS) |
6813 				S_411_CP_SYNC(i == last_target));
6814 		radeon_emit(cs, va);
6815 		radeon_emit(cs, va >> 32);
6816 		radeon_emit(cs, 4 * i); /* destination in GDS */
6817 		radeon_emit(cs, 0);
6818 		radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
6819 				S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
6820 	}
6821 
6822 	radv_set_streamout_enable(cmd_buffer, true);
6823 }
6824 
/**
 * vkCmdBeginTransformFeedbackEXT entry point: dispatches to the NGG or the
 * legacy streamout implementation depending on the physical device.
 */
void radv_CmdBeginTransformFeedbackEXT(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstCounterBuffer,
    uint32_t                                    counterBufferCount,
    const VkBuffer*                             pCounterBuffers,
    const VkDeviceSize*                         pCounterBufferOffsets)
{
	struct radv_cmd_buffer *cmd_buffer =
		radv_cmd_buffer_from_handle(commandBuffer);

	if (!cmd_buffer->device->physical_device->use_ngg_streamout) {
		radv_emit_streamout_begin(cmd_buffer,
					  firstCounterBuffer, counterBufferCount,
					  pCounterBuffers, pCounterBufferOffsets);
		return;
	}

	gfx10_emit_streamout_begin(cmd_buffer,
				   firstCounterBuffer, counterBufferCount,
				   pCounterBuffers, pCounterBufferOffsets);
}
6844 
6845 static void
radv_emit_streamout_end(struct radv_cmd_buffer * cmd_buffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)6846 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer,
6847 			uint32_t firstCounterBuffer,
6848 			uint32_t counterBufferCount,
6849 			const VkBuffer *pCounterBuffers,
6850 			const VkDeviceSize *pCounterBufferOffsets)
6851 {
6852 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6853 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6854 	uint32_t i;
6855 
6856 	radv_flush_vgt_streamout(cmd_buffer);
6857 
6858 	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6859 	for_each_bit(i, so->enabled_mask) {
6860 		int32_t counter_buffer_idx = i - firstCounterBuffer;
6861 		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6862 			counter_buffer_idx = -1;
6863 
6864 		if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6865 			/* The array of counters buffer is optional. */
6866 			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6867 			uint64_t va = radv_buffer_get_va(buffer->bo);
6868 			uint64_t counter_buffer_offset = 0;
6869 
6870 			if (pCounterBufferOffsets)
6871 				counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
6872 
6873 			va += buffer->offset + counter_buffer_offset;
6874 
6875 			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
6876 			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
6877 					STRMOUT_DATA_TYPE(1) | /* offset in bytes */
6878 					STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
6879 					STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
6880 			radeon_emit(cs, va);		/* dst address lo */
6881 			radeon_emit(cs, va >> 32);	/* dst address hi */
6882 			radeon_emit(cs, 0);		/* unused */
6883 			radeon_emit(cs, 0);		/* unused */
6884 
6885 			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6886 		}
6887 
6888 		/* Deactivate transform feedback by zeroing the buffer size.
6889 		 * The counters (primitives generated, primitives emitted) may
6890 		 * be enabled even if there is not buffer bound. This ensures
6891 		 * that the primitives-emitted query won't increment.
6892 		 */
6893 		radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
6894 
6895 		cmd_buffer->state.context_roll_without_scissor_emitted = true;
6896 	}
6897 
6898 	radv_set_streamout_enable(cmd_buffer, false);
6899 }
6900 
6901 static void
gfx10_emit_streamout_end(struct radv_cmd_buffer * cmd_buffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)6902 gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer,
6903 			 uint32_t firstCounterBuffer,
6904 			 uint32_t counterBufferCount,
6905 			 const VkBuffer *pCounterBuffers,
6906 			 const VkDeviceSize *pCounterBufferOffsets)
6907 {
6908 	struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6909 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
6910 	uint32_t i;
6911 
6912 	assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
6913 	assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
6914 
6915 	for_each_bit(i, so->enabled_mask) {
6916 		int32_t counter_buffer_idx = i - firstCounterBuffer;
6917 		if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
6918 			counter_buffer_idx = -1;
6919 
6920 		if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
6921 			/* The array of counters buffer is optional. */
6922 			RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
6923 			uint64_t va = radv_buffer_get_va(buffer->bo);
6924 			uint64_t counter_buffer_offset = 0;
6925 
6926 			if (pCounterBufferOffsets)
6927 				counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
6928 
6929 			va += buffer->offset + counter_buffer_offset;
6930 
6931 			si_cs_emit_write_event_eop(cs,
6932 						   cmd_buffer->device->physical_device->rad_info.chip_class,
6933 						   radv_cmd_buffer_uses_mec(cmd_buffer),
6934 						   V_028A90_PS_DONE, 0,
6935 						   EOP_DST_SEL_TC_L2,
6936 						   EOP_DATA_SEL_GDS,
6937 						   va, EOP_DATA_GDS(i, 1), 0);
6938 
6939 			radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
6940 		}
6941 	}
6942 
6943 	radv_set_streamout_enable(cmd_buffer, false);
6944 }
6945 
/*
 * vkCmdEndTransformFeedbackEXT entry point.
 *
 * Forwards all arguments unchanged to one of the two streamout "end"
 * emitters: gfx10_emit_streamout_end() when the physical device has
 * use_ngg_streamout set, radv_emit_streamout_end() otherwise.
 */
void radv_CmdEndTransformFeedbackEXT(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstCounterBuffer,
    uint32_t                                    counterBufferCount,
    const VkBuffer*                             pCounterBuffers,
    const VkDeviceSize*                         pCounterBufferOffsets)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

	if (!cmd_buffer->device->physical_device->use_ngg_streamout) {
		radv_emit_streamout_end(cmd_buffer,
					firstCounterBuffer, counterBufferCount,
					pCounterBuffers, pCounterBufferOffsets);
		return;
	}

	gfx10_emit_streamout_end(cmd_buffer,
				 firstCounterBuffer, counterBufferCount,
				 pCounterBuffers, pCounterBufferOffsets);
}
6965 
/*
 * vkCmdDrawIndirectByteCountEXT entry point.
 *
 * Builds a radv_draw_info carrying the instance range, the counter
 * buffer with its offset, and the vertex stride, then hands it to
 * radv_draw(). All other draw-info fields are left zero.
 *
 * NOTE(review): counterOffset is accepted but never read here; confirm
 * whether the draw path is expected to honour it.
 */
void radv_CmdDrawIndirectByteCountEXT(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    instanceCount,
    uint32_t                                    firstInstance,
    VkBuffer                                    _counterBuffer,
    VkDeviceSize                                counterBufferOffset,
    uint32_t                                    counterOffset,
    uint32_t                                    vertexStride)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
	struct radv_draw_info info = {
		.instance_count = instanceCount,
		.first_instance = firstInstance,
		.strmout_buffer = counterBuffer,
		.strmout_buffer_offset = counterBufferOffset,
		.stride = vertexStride,
	};

	radv_draw(cmd_buffer, &info);
}
6987 
6988 /* VK_AMD_buffer_marker */
/*
 * vkCmdWriteBufferMarkerAMD entry point: write the 32-bit value `marker`
 * into dstBuffer at byte offset dstOffset.
 *
 * Pending cache flushes are emitted first. If pipelineStage contains no
 * bit other than VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, the marker is written
 * with a COPY_DATA packet (immediate source, memory destination, write
 * confirm). Otherwise it is written by a BOTTOM_OF_PIPE_TS end-of-pipe
 * event.
 *
 * NOTE(review): unlike the streamout paths in this file, the destination
 * BO is not passed to radv_cs_add_buffer() here - confirm that this is
 * intentional.
 */
void radv_CmdWriteBufferMarkerAMD(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkBuffer                                    dstBuffer,
    VkDeviceSize                                dstOffset,
    uint32_t                                    marker)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	/* dstOffset is relative to the start of the VkBuffer, so the
	 * buffer's own offset inside its BO has to be added as well;
	 * otherwise the marker lands at the wrong address whenever the
	 * buffer is bound at a non-zero offset.
	 */
	uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;

	si_emit_cache_flush(cmd_buffer);

	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

	if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
				COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
				COPY_DATA_WR_CONFIRM);
		radeon_emit(cs, marker);	/* immediate data lo */
		radeon_emit(cs, 0);		/* immediate data hi */
		radeon_emit(cs, va);		/* dst address lo */
		radeon_emit(cs, va >> 32);	/* dst address hi */
	} else {
		si_cs_emit_write_event_eop(cs,
					   cmd_buffer->device->physical_device->rad_info.chip_class,
					   radv_cmd_buffer_uses_mec(cmd_buffer),
					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
					   EOP_DST_SEL_MEM,
					   EOP_DATA_SEL_VALUE_32BIT,
					   va, marker,
					   cmd_buffer->gfx9_eop_bug_va);
	}

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
7027