1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 
24 /* This file handles register programming of primitive binning. */
25 
26 #include "si_pipe.h"
27 #include "sid.h"
28 #include "gfx9d.h"
29 #include "radeon/r600_cs.h"
30 
/* A pair of unsigned values; this file uses it for bin dimensions. */
struct uvec2 {
	unsigned x;
	unsigned y;
};
34 
/* One entry of a bin size lookup list.
 *
 * An entry applies to sums in the range [start, next entry's start).
 * A list ends with an entry whose start is UINT_MAX.
 */
struct si_bin_size_map {
	unsigned start;
	unsigned bin_size_x, bin_size_y;
};
40 
/* Lookup lists for one "render backends per shader engine" configuration.
 * The first index is log2 of the shader engine count (rounded up); the
 * second walks up to 9 entries, the last used one having start == UINT_MAX.
 */
typedef struct si_bin_size_map si_bin_size_subtable[3][9];
42 
43 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
si_find_bin_size(struct si_screen * sscreen,const si_bin_size_subtable table[],unsigned sum)44 static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
45 				     const si_bin_size_subtable table[],
46 				     unsigned sum)
47 {
48 	unsigned log_num_rb_per_se =
49 		util_logbase2_ceil(sscreen->info.num_render_backends /
50 				   sscreen->info.max_se);
51 	unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
52 	unsigned i;
53 
54 	/* Get the chip-specific subtable. */
55 	const struct si_bin_size_map *subtable =
56 		&table[log_num_rb_per_se][log_num_se][0];
57 
58 	for (i = 0; subtable[i].start != UINT_MAX; i++) {
59 		if (sum >= subtable[i].start && sum < subtable[i + 1].start)
60 			break;
61 	}
62 
63 	struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
64 	return size;
65 }
66 
si_get_color_bin_size(struct si_context * sctx,unsigned cb_target_enabled_4bit)67 static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
68 					  unsigned cb_target_enabled_4bit)
69 {
70 	unsigned nr_samples = sctx->framebuffer.nr_samples;
71 	unsigned sum = 0;
72 
73 	/* Compute the sum of all Bpp. */
74 	for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
75 		if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
76 			continue;
77 
78 		struct r600_texture *rtex =
79 			(struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
80 		sum += rtex->surface.bpe;
81 	}
82 
83 	/* Multiply the sum by some function of the number of samples. */
84 	if (nr_samples >= 2) {
85 		if (sctx->ps_iter_samples >= 2)
86 			sum *= nr_samples;
87 		else
88 			sum *= 2;
89 	}
90 
91 	static const si_bin_size_subtable table[] = {
92 		{
93 			/* One RB / SE */
94 			{
95 				/* One shader engine */
96 				{        0,  128,  128 },
97 				{        1,   64,  128 },
98 				{        2,   32,  128 },
99 				{        3,   16,  128 },
100 				{       17,    0,    0 },
101 				{ UINT_MAX,    0,    0 },
102 			},
103 			{
104 				/* Two shader engines */
105 				{        0,  128,  128 },
106 				{        2,   64,  128 },
107 				{        3,   32,  128 },
108 				{        5,   16,  128 },
109 				{       17,    0,    0 },
110 				{ UINT_MAX,    0,    0 },
111 			},
112 			{
113 				/* Four shader engines */
114 				{        0,  128,  128 },
115 				{        3,   64,  128 },
116 				{        5,   16,  128 },
117 				{       17,    0,    0 },
118 				{ UINT_MAX,    0,    0 },
119 			},
120 		},
121 		{
122 			/* Two RB / SE */
123 			{
124 				/* One shader engine */
125 				{        0,  128,  128 },
126 				{        2,   64,  128 },
127 				{        3,   32,  128 },
128 				{        5,   16,  128 },
129 				{       33,    0,    0 },
130 				{ UINT_MAX,    0,    0 },
131 			},
132 			{
133 				/* Two shader engines */
134 				{        0,  128,  128 },
135 				{        3,   64,  128 },
136 				{        5,   32,  128 },
137 				{        9,   16,  128 },
138 				{       33,    0,    0 },
139 				{ UINT_MAX,    0,    0 },
140 			},
141 			{
142 				/* Four shader engines */
143 				{        0,  256,  256 },
144 				{        2,  128,  256 },
145 				{        3,  128,  128 },
146 				{        5,   64,  128 },
147 				{        9,   16,  128 },
148 				{       33,    0,    0 },
149 				{ UINT_MAX,    0,    0 },
150 			},
151 		},
152 		{
153 			/* Four RB / SE */
154 			{
155 				/* One shader engine */
156 				{        0,  128,  256 },
157 				{        2,  128,  128 },
158 				{        3,   64,  128 },
159 				{        5,   32,  128 },
160 				{        9,   16,  128 },
161 				{       33,    0,    0 },
162 				{ UINT_MAX,    0,    0 },
163 			},
164 			{
165 				/* Two shader engines */
166 				{        0,  256,  256 },
167 				{        2,  128,  256 },
168 				{        3,  128,  128 },
169 				{        5,   64,  128 },
170 				{        9,   32,  128 },
171 				{       17,   16,  128 },
172 				{       33,    0,    0 },
173 				{ UINT_MAX,    0,    0 },
174 			},
175 			{
176 				/* Four shader engines */
177 				{        0,  256,  512 },
178 				{        2,  256,  256 },
179 				{        3,  128,  256 },
180 				{        5,  128,  128 },
181 				{        9,   64,  128 },
182 				{       17,   16,  128 },
183 				{       33,    0,    0 },
184 				{ UINT_MAX,    0,    0 },
185 			},
186 		},
187 	};
188 
189 	return si_find_bin_size(sctx->screen, table, sum);
190 }
191 
si_get_depth_bin_size(struct si_context * sctx)192 static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
193 {
194 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
195 
196 	if (!sctx->framebuffer.state.zsbuf ||
197 	    (!dsa->depth_enabled && !dsa->stencil_enabled)) {
198 		/* Return the max size. */
199 		struct uvec2 size = {512, 512};
200 		return size;
201 	}
202 
203 	struct r600_texture *rtex =
204 		(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
205 	unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
206 	unsigned stencil_coeff = rtex->surface.has_stencil &&
207 				 dsa->stencil_enabled ? 1 : 0;
208 	unsigned sum = 4 * (depth_coeff + stencil_coeff) *
209 		       sctx->framebuffer.nr_samples;
210 
211 	static const si_bin_size_subtable table[] = {
212 		{
213 			// One RB / SE
214 			{
215 				// One shader engine
216 				{        0,  128,  256 },
217 				{        2,  128,  128 },
218 				{        4,   64,  128 },
219 				{        7,   32,  128 },
220 				{       13,   16,  128 },
221 				{       49,    0,    0 },
222 				{ UINT_MAX,    0,    0 },
223 			},
224 			{
225 				// Two shader engines
226 				{        0,  256,  256 },
227 				{        2,  128,  256 },
228 				{        4,  128,  128 },
229 				{        7,   64,  128 },
230 				{       13,   32,  128 },
231 				{       25,   16,  128 },
232 				{       49,    0,    0 },
233 				{ UINT_MAX,    0,    0 },
234 			},
235 			{
236 				// Four shader engines
237 				{        0,  256,  512 },
238 				{        2,  256,  256 },
239 				{        4,  128,  256 },
240 				{        7,  128,  128 },
241 				{       13,   64,  128 },
242 				{       25,   16,  128 },
243 				{       49,    0,    0 },
244 				{ UINT_MAX,    0,    0 },
245 			},
246 		},
247 		{
248 			// Two RB / SE
249 			{
250 				// One shader engine
251 				{        0,  256,  256 },
252 				{        2,  128,  256 },
253 				{        4,  128,  128 },
254 				{        7,   64,  128 },
255 				{       13,   32,  128 },
256 				{       25,   16,  128 },
257 				{       97,    0,    0 },
258 				{ UINT_MAX,    0,    0 },
259 			},
260 			{
261 				// Two shader engines
262 				{        0,  256,  512 },
263 				{        2,  256,  256 },
264 				{        4,  128,  256 },
265 				{        7,  128,  128 },
266 				{       13,   64,  128 },
267 				{       25,   32,  128 },
268 				{       49,   16,  128 },
269 				{       97,    0,    0 },
270 				{ UINT_MAX,    0,    0 },
271 			},
272 			{
273 				// Four shader engines
274 				{        0,  512,  512 },
275 				{        2,  256,  512 },
276 				{        4,  256,  256 },
277 				{        7,  128,  256 },
278 				{       13,  128,  128 },
279 				{       25,   64,  128 },
280 				{       49,   16,  128 },
281 				{       97,    0,    0 },
282 				{ UINT_MAX,    0,    0 },
283 			},
284 		},
285 		{
286 			// Four RB / SE
287 			{
288 				// One shader engine
289 				{        0,  256,  512 },
290 				{        2,  256,  256 },
291 				{        4,  128,  256 },
292 				{        7,  128,  128 },
293 				{       13,   64,  128 },
294 				{       25,   32,  128 },
295 				{       49,   16,  128 },
296 				{ UINT_MAX,    0,    0 },
297 			},
298 			{
299 				// Two shader engines
300 				{        0,  512,  512 },
301 				{        2,  256,  512 },
302 				{        4,  256,  256 },
303 				{        7,  128,  256 },
304 				{       13,  128,  128 },
305 				{       25,   64,  128 },
306 				{       49,   32,  128 },
307 				{       97,   16,  128 },
308 				{ UINT_MAX,    0,    0 },
309 			},
310 			{
311 				// Four shader engines
312 				{        0,  512,  512 },
313 				{        4,  256,  512 },
314 				{        7,  256,  256 },
315 				{       13,  128,  256 },
316 				{       25,  128,  128 },
317 				{       49,   64,  128 },
318 				{       97,   16,  128 },
319 				{ UINT_MAX,    0,    0 },
320 			},
321 		},
322 	};
323 
324 	return si_find_bin_size(sctx->screen, table, sum);
325 }
326 
si_emit_dpbb_disable(struct si_context * sctx)327 static void si_emit_dpbb_disable(struct si_context *sctx)
328 {
329 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
330 
331 	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
332 			       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
333 			       S_028C44_DISABLE_START_OF_PRIM(1));
334 	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
335 			       S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
336 }
337 
si_emit_dpbb_state(struct si_context * sctx,struct r600_atom * state)338 void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
339 {
340 	struct si_screen *sscreen = sctx->screen;
341 	struct si_state_blend *blend = sctx->queued.named.blend;
342 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
343 	unsigned db_shader_control = sctx->ps_db_shader_control;
344 
345 	assert(sctx->b.chip_class >= GFX9);
346 
347 	if (!sscreen->dpbb_allowed || !blend || !dsa) {
348 		si_emit_dpbb_disable(sctx);
349 		return;
350 	}
351 
352 	bool ps_can_kill = G_02880C_KILL_ENABLE(db_shader_control) ||
353 			   G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
354 			   G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
355 			   blend->alpha_to_coverage;
356 
357 	/* This is ported from Vulkan, but it doesn't make much sense to me.
358 	 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
359 	 */
360 	bool ps_can_reject_z_trivially =
361 		!G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
362 		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
363 
364 	/* Disable binning if PS can kill trivially with DB writes.
365 	 * Ported from Vulkan. (heuristic?)
366 	 */
367 	if (ps_can_kill &&
368 	    ps_can_reject_z_trivially &&
369 	    sctx->framebuffer.state.zsbuf &&
370 	    dsa->db_can_write) {
371 		si_emit_dpbb_disable(sctx);
372 		return;
373 	}
374 
375 	/* Compute the bin size. */
376 	/* TODO: We could also look at enabled pixel shader outputs. */
377 	unsigned cb_target_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit &
378 					  blend->cb_target_enabled_4bit;
379 	struct uvec2 color_bin_size =
380 		si_get_color_bin_size(sctx, cb_target_enabled_4bit);
381 	struct uvec2 depth_bin_size = si_get_depth_bin_size(sctx);
382 
383 	unsigned color_area = color_bin_size.x * color_bin_size.y;
384 	unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
385 
386 	struct uvec2 bin_size = color_area < depth_area ? color_bin_size
387 							: depth_bin_size;
388 
389 	if (!bin_size.x || !bin_size.y) {
390 		si_emit_dpbb_disable(sctx);
391 		return;
392 	}
393 
394 	/* Enable DFSM if it's preferred. */
395 	unsigned punchout_mode = V_028060_FORCE_OFF;
396 	bool disable_start_of_prim = true;
397 
398 	if (sscreen->dfsm_allowed &&
399 	    cb_target_enabled_4bit &&
400 	    !G_02880C_KILL_ENABLE(db_shader_control) &&
401 	    /* These two also imply that DFSM is disabled when PS writes to memory. */
402 	    !G_02880C_EXEC_ON_HIER_FAIL(db_shader_control) &&
403 	    !G_02880C_EXEC_ON_NOOP(db_shader_control) &&
404 	    G_02880C_Z_ORDER(db_shader_control) == V_02880C_EARLY_Z_THEN_LATE_Z) {
405 		punchout_mode = V_028060_AUTO;
406 		disable_start_of_prim = (cb_target_enabled_4bit &
407 					 blend->blend_enable_4bit) != 0;
408 	}
409 
410 	/* Tunable parameters. Also test with DFSM enabled/disabled. */
411 	unsigned context_states_per_bin; /* allowed range: [0, 5] */
412 	unsigned persistent_states_per_bin; /* allowed range: [0, 31] */
413 	unsigned fpovs_per_batch; /* allowed range: [0, 255], 0 = unlimited */
414 
415 	switch (sctx->b.family) {
416 	case CHIP_VEGA10:
417 	case CHIP_RAVEN:
418 		/* Tuned for Raven. Vega might need different values. */
419 		context_states_per_bin = 5;
420 		persistent_states_per_bin = 31;
421 		fpovs_per_batch = 63;
422 		break;
423 	default:
424 		assert(0);
425 	}
426 
427 	/* Emit registers. */
428 	struct uvec2 bin_size_extend = {};
429 	if (bin_size.x >= 32)
430 		bin_size_extend.x = util_logbase2(bin_size.x) - 5;
431 	if (bin_size.y >= 32)
432 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;
433 
434 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
435 	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
436 			       S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
437 			       S_028C44_BIN_SIZE_X(bin_size.x == 16) |
438 			       S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
439 			       S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
440 			       S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
441 			       S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
442 			       S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
443 			       S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
444 			       S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
445 			       S_028C44_OPTIMAL_BIN_SELECTION(1));
446 	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
447 			       S_028060_PUNCHOUT_MODE(punchout_mode));
448 }
449