1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "sid.h"
28 #include "util/fast_idiv_by_const.h"
29 #include "util/format/u_format.h"
30 #include "util/format/u_format_s3tc.h"
31 #include "util/u_dual_blend.h"
32 #include "util/u_memory.h"
33 #include "util/u_resource.h"
34 #include "util/u_upload_mgr.h"
35 #include "util/u_blend.h"
36 
37 #include "gfx10_format_table.h"
38 
si_map_swizzle(unsigned swizzle)39 static unsigned si_map_swizzle(unsigned swizzle)
40 {
41    switch (swizzle) {
42    case PIPE_SWIZZLE_Y:
43       return V_008F0C_SQ_SEL_Y;
44    case PIPE_SWIZZLE_Z:
45       return V_008F0C_SQ_SEL_Z;
46    case PIPE_SWIZZLE_W:
47       return V_008F0C_SQ_SEL_W;
48    case PIPE_SWIZZLE_0:
49       return V_008F0C_SQ_SEL_0;
50    case PIPE_SWIZZLE_1:
51       return V_008F0C_SQ_SEL_1;
52    default: /* PIPE_SWIZZLE_X */
53       return V_008F0C_SQ_SEL_X;
54    }
55 }
56 
/* Convert a float to 12.4 fixed point, clamping to the representable
 * range [0, 4095.9375]. Values <= 0 become 0; values >= 4096 saturate
 * to 0xffff. */
static unsigned si_pack_float_12p4(float x)
{
   if (x <= 0)
      return 0;
   if (x >= 4096)
      return 0xffff;
   return x * 16;
}
62 
63 /*
64  * Inferred framebuffer and blender state.
65  *
66  * CB_TARGET_MASK is emitted here to avoid a hang with dual source blending
67  * if there is not enough PS outputs.
68  */
si_emit_cb_render_state(struct si_context * sctx)69 static void si_emit_cb_render_state(struct si_context *sctx)
70 {
71    struct radeon_cmdbuf *cs = sctx->gfx_cs;
72    struct si_state_blend *blend = sctx->queued.named.blend;
73    /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
74     * but you never know. */
75    uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask;
76    unsigned i;
77 
78    /* Avoid a hang that happens when dual source blending is enabled
79     * but there is not enough color outputs. This is undefined behavior,
80     * so disable color writes completely.
81     *
82     * Reproducible with Unigine Heaven 4.0 and drirc missing.
83     */
84    if (blend->dual_src_blend && sctx->ps_shader.cso &&
85        (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
86       cb_target_mask = 0;
87 
88    /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
89     * I think we don't have to do anything between IBs.
90     */
91    if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) {
92       sctx->last_cb_target_mask = cb_target_mask;
93 
94       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
95       radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
96    }
97 
98    unsigned initial_cdw = cs->current.cdw;
99    radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK,
100                               cb_target_mask);
101 
102    if (sctx->chip_class >= GFX8) {
103       /* DCC MSAA workaround.
104        * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
105        * COMBINER_DISABLE, but that would be more complicated.
106        */
107       bool oc_disable =
108          blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2;
109       unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
110 
111       radeon_opt_set_context_reg(
112          sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL,
113          S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
114             S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
115             S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
116             S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
117    }
118 
119    /* RB+ register settings. */
120    if (sctx->screen->info.rbplus_allowed) {
121       unsigned spi_shader_col_format =
122          sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format
123                              : 0;
124       unsigned sx_ps_downconvert = 0;
125       unsigned sx_blend_opt_epsilon = 0;
126       unsigned sx_blend_opt_control = 0;
127 
128       for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
129          struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i];
130          unsigned format, swap, spi_format, colormask;
131          bool has_alpha, has_rgb;
132 
133          if (!surf) {
134             /* If the color buffer is not set, the driver sets 32_R
135              * as the SPI color format, because the hw doesn't allow
136              * holes between color outputs, so also set this to
137              * enable RB+.
138              */
139             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
140             continue;
141          }
142 
143          format = G_028C70_FORMAT(surf->cb_color_info);
144          swap = G_028C70_COMP_SWAP(surf->cb_color_info);
145          spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
146          colormask = (cb_target_mask >> (i * 4)) & 0xf;
147 
148          /* Set if RGB and A are present. */
149          has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);
150 
151          if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 ||
152              format == V_028C70_COLOR_32)
153             has_rgb = !has_alpha;
154          else
155             has_rgb = true;
156 
157          /* Check the colormask and export format. */
158          if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
159             has_rgb = false;
160          if (!(colormask & PIPE_MASK_A))
161             has_alpha = false;
162 
163          if (spi_format == V_028714_SPI_SHADER_ZERO) {
164             has_rgb = false;
165             has_alpha = false;
166          }
167 
168          /* Disable value checking for disabled channels. */
169          if (!has_rgb)
170             sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
171          if (!has_alpha)
172             sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
173 
174          /* Enable down-conversion for 32bpp and smaller formats. */
175          switch (format) {
176          case V_028C70_COLOR_8:
177          case V_028C70_COLOR_8_8:
178          case V_028C70_COLOR_8_8_8_8:
179             /* For 1 and 2-channel formats, use the superset thereof. */
180             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
181                 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
182                 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
183                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
184                sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
185             }
186             break;
187 
188          case V_028C70_COLOR_5_6_5:
189             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
190                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
191                sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
192             }
193             break;
194 
195          case V_028C70_COLOR_1_5_5_5:
196             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
197                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
198                sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
199             }
200             break;
201 
202          case V_028C70_COLOR_4_4_4_4:
203             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
204                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
205                sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
206             }
207             break;
208 
209          case V_028C70_COLOR_32:
210             if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
211                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
212             else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
213                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
214             break;
215 
216          case V_028C70_COLOR_16:
217          case V_028C70_COLOR_16_16:
218             /* For 1-channel formats, use the superset thereof. */
219             if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
220                 spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
221                 spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
222                 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
223                if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
224                   sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
225                else
226                   sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
227             }
228             break;
229 
230          case V_028C70_COLOR_10_11_11:
231             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
232                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
233             break;
234 
235          case V_028C70_COLOR_2_10_10_10:
236             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
237                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
238                sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
239             }
240             break;
241 
242          case V_028C70_COLOR_5_9_9_9:
243             if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
244                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
245             break;
246          }
247       }
248 
249       /* If there are no color outputs, the first color export is
250        * always enabled as 32_R, so also set this to enable RB+.
251        */
252       if (!sx_ps_downconvert)
253          sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
254 
255       /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
256       radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT,
257                                   sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
258    }
259    if (initial_cdw != cs->current.cdw)
260       sctx->context_roll = true;
261 }
262 
263 /*
264  * Blender functions
265  */
266 
si_translate_blend_function(int blend_func)267 static uint32_t si_translate_blend_function(int blend_func)
268 {
269    switch (blend_func) {
270    case PIPE_BLEND_ADD:
271       return V_028780_COMB_DST_PLUS_SRC;
272    case PIPE_BLEND_SUBTRACT:
273       return V_028780_COMB_SRC_MINUS_DST;
274    case PIPE_BLEND_REVERSE_SUBTRACT:
275       return V_028780_COMB_DST_MINUS_SRC;
276    case PIPE_BLEND_MIN:
277       return V_028780_COMB_MIN_DST_SRC;
278    case PIPE_BLEND_MAX:
279       return V_028780_COMB_MAX_DST_SRC;
280    default:
281       PRINT_ERR("Unknown blend function %d\n", blend_func);
282       assert(0);
283       break;
284    }
285    return 0;
286 }
287 
si_translate_blend_factor(int blend_fact)288 static uint32_t si_translate_blend_factor(int blend_fact)
289 {
290    switch (blend_fact) {
291    case PIPE_BLENDFACTOR_ONE:
292       return V_028780_BLEND_ONE;
293    case PIPE_BLENDFACTOR_SRC_COLOR:
294       return V_028780_BLEND_SRC_COLOR;
295    case PIPE_BLENDFACTOR_SRC_ALPHA:
296       return V_028780_BLEND_SRC_ALPHA;
297    case PIPE_BLENDFACTOR_DST_ALPHA:
298       return V_028780_BLEND_DST_ALPHA;
299    case PIPE_BLENDFACTOR_DST_COLOR:
300       return V_028780_BLEND_DST_COLOR;
301    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
302       return V_028780_BLEND_SRC_ALPHA_SATURATE;
303    case PIPE_BLENDFACTOR_CONST_COLOR:
304       return V_028780_BLEND_CONSTANT_COLOR;
305    case PIPE_BLENDFACTOR_CONST_ALPHA:
306       return V_028780_BLEND_CONSTANT_ALPHA;
307    case PIPE_BLENDFACTOR_ZERO:
308       return V_028780_BLEND_ZERO;
309    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
310       return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
311    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
312       return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
313    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
314       return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
315    case PIPE_BLENDFACTOR_INV_DST_COLOR:
316       return V_028780_BLEND_ONE_MINUS_DST_COLOR;
317    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
318       return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
319    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
320       return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
321    case PIPE_BLENDFACTOR_SRC1_COLOR:
322       return V_028780_BLEND_SRC1_COLOR;
323    case PIPE_BLENDFACTOR_SRC1_ALPHA:
324       return V_028780_BLEND_SRC1_ALPHA;
325    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
326       return V_028780_BLEND_INV_SRC1_COLOR;
327    case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
328       return V_028780_BLEND_INV_SRC1_ALPHA;
329    default:
330       PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
331       assert(0);
332       break;
333    }
334    return 0;
335 }
336 
si_translate_blend_opt_function(int blend_func)337 static uint32_t si_translate_blend_opt_function(int blend_func)
338 {
339    switch (blend_func) {
340    case PIPE_BLEND_ADD:
341       return V_028760_OPT_COMB_ADD;
342    case PIPE_BLEND_SUBTRACT:
343       return V_028760_OPT_COMB_SUBTRACT;
344    case PIPE_BLEND_REVERSE_SUBTRACT:
345       return V_028760_OPT_COMB_REVSUBTRACT;
346    case PIPE_BLEND_MIN:
347       return V_028760_OPT_COMB_MIN;
348    case PIPE_BLEND_MAX:
349       return V_028760_OPT_COMB_MAX;
350    default:
351       return V_028760_OPT_COMB_BLEND_DISABLED;
352    }
353 }
354 
si_translate_blend_opt_factor(int blend_fact,bool is_alpha)355 static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
356 {
357    switch (blend_fact) {
358    case PIPE_BLENDFACTOR_ZERO:
359       return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
360    case PIPE_BLENDFACTOR_ONE:
361       return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
362    case PIPE_BLENDFACTOR_SRC_COLOR:
363       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
364                       : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
365    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
366       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
367                       : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
368    case PIPE_BLENDFACTOR_SRC_ALPHA:
369       return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
370    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
371       return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
372    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
373       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
374                       : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
375    default:
376       return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
377    }
378 }
379 
si_blend_check_commutativity(struct si_screen * sscreen,struct si_state_blend * blend,enum pipe_blend_func func,enum pipe_blendfactor src,enum pipe_blendfactor dst,unsigned chanmask)380 static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend,
381                                          enum pipe_blend_func func, enum pipe_blendfactor src,
382                                          enum pipe_blendfactor dst, unsigned chanmask)
383 {
384    /* Src factor is allowed when it does not depend on Dst */
385    static const uint32_t src_allowed =
386       (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
387       (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
388       (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
389       (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
390       (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
391       (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
392       (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
393       (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
394 
395    if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) {
396       /* Addition is commutative, but floating point addition isn't
397        * associative: subtle changes can be introduced via different
398        * rounding.
399        *
400        * Out-of-order is also non-deterministic, which means that
401        * this breaks OpenGL invariance requirements. So only enable
402        * out-of-order additive blending if explicitly allowed by a
403        * setting.
404        */
405       if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
406           (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
407          blend->commutative_4bit |= chanmask;
408    }
409 }
410 
411 /**
412  * Get rid of DST in the blend factors by commuting the operands:
413  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
414  */
si_blend_remove_dst(unsigned * func,unsigned * src_factor,unsigned * dst_factor,unsigned expected_dst,unsigned replacement_src)415 static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
416                                 unsigned expected_dst, unsigned replacement_src)
417 {
418    if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) {
419       *src_factor = PIPE_BLENDFACTOR_ZERO;
420       *dst_factor = replacement_src;
421 
422       /* Commuting the operands requires reversing subtractions. */
423       if (*func == PIPE_BLEND_SUBTRACT)
424          *func = PIPE_BLEND_REVERSE_SUBTRACT;
425       else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
426          *func = PIPE_BLEND_SUBTRACT;
427    }
428 }
429 
si_create_blend_state_mode(struct pipe_context * ctx,const struct pipe_blend_state * state,unsigned mode)430 static void *si_create_blend_state_mode(struct pipe_context *ctx,
431                                         const struct pipe_blend_state *state, unsigned mode)
432 {
433    struct si_context *sctx = (struct si_context *)ctx;
434    struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
435    struct si_pm4_state *pm4 = &blend->pm4;
436    uint32_t sx_mrt_blend_opt[8] = {0};
437    uint32_t color_control = 0;
438    bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY;
439 
440    if (!blend)
441       return NULL;
442 
443    blend->alpha_to_coverage = state->alpha_to_coverage;
444    blend->alpha_to_one = state->alpha_to_one;
445    blend->dual_src_blend = util_blend_state_is_dual(state, 0);
446    blend->logicop_enable = logicop_enable;
447 
448    unsigned num_shader_outputs = state->max_rt + 1; /* estimate */
449    if (blend->dual_src_blend)
450       num_shader_outputs = MAX2(num_shader_outputs, 2);
451 
452    if (logicop_enable) {
453       color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
454    } else {
455       color_control |= S_028808_ROP3(0xcc);
456    }
457 
458    if (state->alpha_to_coverage && state->alpha_to_coverage_dither) {
459       si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
460                      S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
461                         S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
462                         S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
463                         S_028B70_OFFSET_ROUND(1));
464    } else {
465       si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
466                      S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
467                         S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
468                         S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
469                         S_028B70_OFFSET_ROUND(0));
470    }
471 
472    if (state->alpha_to_coverage)
473       blend->need_src_alpha_4bit |= 0xf;
474 
475    blend->cb_target_mask = 0;
476    blend->cb_target_enabled_4bit = 0;
477 
478    for (int i = 0; i < num_shader_outputs; i++) {
479       /* state->rt entries > 0 only written if independent blending */
480       const int j = state->independent_blend_enable ? i : 0;
481 
482       unsigned eqRGB = state->rt[j].rgb_func;
483       unsigned srcRGB = state->rt[j].rgb_src_factor;
484       unsigned dstRGB = state->rt[j].rgb_dst_factor;
485       unsigned eqA = state->rt[j].alpha_func;
486       unsigned srcA = state->rt[j].alpha_src_factor;
487       unsigned dstA = state->rt[j].alpha_dst_factor;
488 
489       unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
490       unsigned blend_cntl = 0;
491 
492       sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
493                             S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
494 
495       /* Only set dual source blending for MRT0 to avoid a hang. */
496       if (i >= 1 && blend->dual_src_blend) {
497          /* Vulkan does this for dual source blending. */
498          if (i == 1)
499             blend_cntl |= S_028780_ENABLE(1);
500 
501          si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
502          continue;
503       }
504 
505       /* Only addition and subtraction equations are supported with
506        * dual source blending.
507        */
508       if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
509                                     eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
510          assert(!"Unsupported equation for dual source blending");
511          si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
512          continue;
513       }
514 
515       /* cb_render_state will disable unused ones */
516       blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
517       if (state->rt[j].colormask)
518          blend->cb_target_enabled_4bit |= 0xf << (4 * i);
519 
520       if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
521          si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
522          continue;
523       }
524 
525       si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
526       si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i));
527 
528       /* Blending optimizations for RB+.
529        * These transformations don't change the behavior.
530        *
531        * First, get rid of DST in the blend factors:
532        *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
533        */
534       si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR,
535                           PIPE_BLENDFACTOR_SRC_COLOR);
536       si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR,
537                           PIPE_BLENDFACTOR_SRC_COLOR);
538       si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA,
539                           PIPE_BLENDFACTOR_SRC_ALPHA);
540 
541       /* Look up the ideal settings from tables. */
542       srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
543       dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
544       srcA_opt = si_translate_blend_opt_factor(srcA, true);
545       dstA_opt = si_translate_blend_opt_factor(dstA, true);
546 
547       /* Handle interdependencies. */
548       if (util_blend_factor_uses_dest(srcRGB, false))
549          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
550       if (util_blend_factor_uses_dest(srcA, false))
551          dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
552 
553       if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
554           (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
555            dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
556          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
557 
558       /* Set the final value. */
559       sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) |
560                             S_028760_COLOR_DST_OPT(dstRGB_opt) |
561                             S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
562                             S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
563                             S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
564 
565       /* Set blend state. */
566       blend_cntl |= S_028780_ENABLE(1);
567       blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
568       blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
569       blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
570 
571       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
572          blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
573          blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
574          blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
575          blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
576       }
577       si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
578 
579       blend->blend_enable_4bit |= 0xfu << (i * 4);
580 
581       if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10)
582          blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
583 
584       /* This is only important for formats without alpha. */
585       if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
586           srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
587           dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
588           srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
589          blend->need_src_alpha_4bit |= 0xfu << (i * 4);
590    }
591 
592    if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10 && logicop_enable)
593       blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
594 
595    if (blend->cb_target_mask) {
596       color_control |= S_028808_MODE(mode);
597    } else {
598       color_control |= S_028808_MODE(V_028808_CB_DISABLE);
599    }
600 
601    if (sctx->screen->info.rbplus_allowed) {
602       /* Disable RB+ blend optimizations for dual source blending.
603        * Vulkan does this.
604        */
605       if (blend->dual_src_blend) {
606          for (int i = 0; i < num_shader_outputs; i++) {
607             sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
608                                   S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
609          }
610       }
611 
612       for (int i = 0; i < num_shader_outputs; i++)
613          si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);
614 
615       /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
616       if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE)
617          color_control |= S_028808_DISABLE_DUAL_QUAD(1);
618    }
619 
620    si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
621    return blend;
622 }
623 
/* Standard gallium create_blend_state hook: creates a blend CSO for
 * normal color-buffer operation (CB_NORMAL mode). */
static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state)
{
   return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
}
628 
/* Bind a blend CSO (NULL binds the no-op blend state) and flag the
 * derived state atoms that depend on fields which actually changed. */
static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_blend *old_blend = sctx->queued.named.blend;
   struct si_state_blend *blend = (struct si_state_blend *)state;

   /* Unbinding falls back to the no-op blend state so old_blend is
    * never compared against a NULL pointer on the next bind. */
   if (!blend)
      blend = (struct si_state_blend *)sctx->noop_blend;

   si_pm4_bind_state(sctx, blend, blend);

   /* cb_render_state emits CB_TARGET_MASK and the DCC MSAA workaround;
    * re-emit it when any of its inputs change. */
   if (old_blend->cb_target_mask != blend->cb_target_mask ||
       old_blend->dual_src_blend != blend->dual_src_blend ||
       (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
        sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);

   /* These fields feed the shader key, so shaders may need recompiling. */
   if (old_blend->cb_target_mask != blend->cb_target_mask ||
       old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
       old_blend->alpha_to_one != blend->alpha_to_one ||
       old_blend->dual_src_blend != blend->dual_src_blend ||
       old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
       old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
      sctx->do_update_shaders = true;

   /* Blend state affects the chosen primitive-binning (DPBB) settings. */
   if (sctx->screen->dpbb_allowed &&
       (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
        old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
        old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);

   /* Commutativity determines whether out-of-order rasterization may be
    * enabled, which is configured by the msaa_config atom. */
   if (sctx->screen->has_out_of_order_rast &&
       ((old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
         old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
         old_blend->commutative_4bit != blend->commutative_4bit ||
         old_blend->logicop_enable != blend->logicop_enable)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
667 
/* Delete a blend CSO. If it is currently bound, bind the no-op blend
 * state first so the queued state never points at freed memory. */
static void si_delete_blend_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;

   if (sctx->queued.named.blend == state)
      si_bind_blend_state(ctx, sctx->noop_blend);

   si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
}
677 
/* Store the constant blend color and mark the blend_color atom dirty.
 * any_nonzeros caches whether the color is entirely zero so later code
 * can test that cheaply without re-running memcmp. */
static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   static const struct pipe_blend_color zeros;

   sctx->blend_color.state = *state;
   sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
}
687 
/* Emit the 4 blend-constant components into the 4 consecutive context
 * registers starting at CB_BLEND_RED. */
static void si_emit_blend_color(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
   radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4);
}
695 
696 /*
697  * Clipping
698  */
699 
static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct pipe_constant_buffer cb;
   static const struct pipe_clip_state zeros;

   /* No-op when the user clip planes are unchanged. */
   if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
      return;

   sctx->clip_state.state = *state;
   sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);

   /* Also upload the user clip planes as a constant buffer:
    * 8 planes * vec4 * 4 bytes = 128 bytes. */
   cb.buffer = NULL;
   cb.user_buffer = state->ucp;
   cb.buffer_offset = 0;
   cb.buffer_size = 4 * 4 * 8;
   si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
   /* cb.buffer is still NULL here, so this is effectively a no-op kept
    * for the usual reference-drop pattern. */
   pipe_resource_reference(&cb.buffer, NULL);
}
720 
/* Emit the user clip plane registers (6 planes * 4 components) starting
 * at PA_CL_UCP_0_X from the cached clip state. */
static void si_emit_clip_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
   radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4);
}
728 
/* Emit PA_CL_VS_OUT_CNTL and PA_CL_CLIP_CNTL derived from the bound
 * vertex-stage shader and the rasterizer state. */
static void si_emit_clip_regs(struct si_context *sctx)
{
   struct si_shader *vs = si_get_vs_state(sctx);
   struct si_shader_selector *vs_sel = vs->selector;
   struct si_shader_info *info = &vs_sel->info;
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   /* Window-space positions are only read from vertex shaders here. */
   bool window_space = info->stage == MESA_SHADER_VERTEX ?
                          info->base.vs.window_space_position : 0;
   unsigned clipdist_mask = vs_sel->clipdist_mask;
   /* Fixed-function UCPs are only enabled when the shader writes no clip
    * distances of its own. */
   unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
   unsigned culldist_mask = vs_sel->culldist_mask;
   /* Distances actually exported, after the shader key killed some. */
   unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask;

   /* Clip distances on points have no effect, so need to be implemented
    * as cull distances. This applies for the clipvertex case as well.
    *
    * Setting this for primitives other than points should have no adverse
    * effects.
    */
   clipdist_mask &= rs->clip_plane_enable;
   culldist_mask |= clipdist_mask;

   /* Snapshot the CS write pointer to detect whether any register write
    * was actually emitted (-> context roll). */
   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
   unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) |
                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) |
                         S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
                         S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
                         clipdist_mask | (culldist_mask << 8);

   if (sctx->chip_class >= GFX10) {
      /* GFX10+: read-modify-write so the VS-owned bits of
       * PA_CL_VS_OUT_CNTL (per __VS_MASK) are preserved. */
      radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
                                     SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
                                     ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
   } else {
      radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
                                 vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
   }
   radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
                              rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));

   if (initial_cdw != sctx->gfx_cs->current.cdw)
      sctx->context_roll = true;
}
772 
773 /*
774  * inferred state between framebuffer and rasterizer
775  */
si_update_poly_offset_state(struct si_context * sctx)776 static void si_update_poly_offset_state(struct si_context *sctx)
777 {
778    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
779 
780    if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
781       si_pm4_bind_state(sctx, poly_offset, NULL);
782       return;
783    }
784 
785    /* Use the user format, not db_render_format, so that the polygon
786     * offset behaves as expected by applications.
787     */
788    switch (sctx->framebuffer.state.zsbuf->texture->format) {
789    case PIPE_FORMAT_Z16_UNORM:
790       si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
791       break;
792    default: /* 24-bit */
793       si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
794       break;
795    case PIPE_FORMAT_Z32_FLOAT:
796    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
797       si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
798       break;
799    }
800 }
801 
802 /*
803  * Rasterizer
804  */
805 
si_translate_fill(uint32_t func)806 static uint32_t si_translate_fill(uint32_t func)
807 {
808    switch (func) {
809    case PIPE_POLYGON_MODE_FILL:
810       return V_028814_X_DRAW_TRIANGLES;
811    case PIPE_POLYGON_MODE_LINE:
812       return V_028814_X_DRAW_LINES;
813    case PIPE_POLYGON_MODE_POINT:
814       return V_028814_X_DRAW_POINTS;
815    default:
816       assert(0);
817       return V_028814_X_DRAW_POINTS;
818    }
819 }
820 
si_create_rs_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)821 static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state)
822 {
823    struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
824    struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
825    struct si_pm4_state *pm4 = &rs->pm4;
826    unsigned tmp, i;
827    float psize_min, psize_max;
828 
829    if (!rs) {
830       return NULL;
831    }
832 
833    if (!state->front_ccw) {
834       rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
835       rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
836    } else {
837       rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
838       rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
839    }
840    rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
841    rs->provoking_vertex_first = state->flatshade_first;
842    rs->scissor_enable = state->scissor;
843    rs->clip_halfz = state->clip_halfz;
844    rs->two_side = state->light_twoside;
845    rs->multisample_enable = state->multisample;
846    rs->force_persample_interp = state->force_persample_interp;
847    rs->clip_plane_enable = state->clip_plane_enable;
848    rs->half_pixel_center = state->half_pixel_center;
849    rs->line_stipple_enable = state->line_stipple_enable;
850    rs->poly_stipple_enable = state->poly_stipple_enable;
851    rs->line_smooth = state->line_smooth;
852    rs->line_width = state->line_width;
853    rs->poly_smooth = state->poly_smooth;
854    rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri;
855    rs->clamp_fragment_color = state->clamp_fragment_color;
856    rs->clamp_vertex_color = state->clamp_vertex_color;
857    rs->flatshade = state->flatshade;
858    rs->flatshade_first = state->flatshade_first;
859    rs->sprite_coord_enable = state->sprite_coord_enable;
860    rs->rasterizer_discard = state->rasterizer_discard;
861    rs->polygon_mode_enabled =
862       (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
863       (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
864    rs->polygon_mode_is_lines =
865       (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) ||
866       (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK));
867    rs->polygon_mode_is_points =
868       (state->fill_front == PIPE_POLYGON_MODE_POINT && !(state->cull_face & PIPE_FACE_FRONT)) ||
869       (state->fill_back == PIPE_POLYGON_MODE_POINT && !(state->cull_face & PIPE_FACE_BACK));
870    rs->pa_sc_line_stipple = state->line_stipple_enable
871                                ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
872                                     S_028A0C_REPEAT_COUNT(state->line_stipple_factor)
873                                : 0;
874    rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
875                          S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
876                          S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
877                          S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
878                          S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
879 
880    si_pm4_set_reg(
881       pm4, R_0286D4_SPI_INTERP_CONTROL_0,
882       S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
883          S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
884          S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
885          S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
886          S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
887          S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
888 
889    /* point size 12.4 fixed point */
890    tmp = (unsigned)(state->point_size * 8.0);
891    si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
892 
893    if (state->point_size_per_vertex) {
894       psize_min = util_get_min_point_size(state);
895       psize_max = SI_MAX_POINT_SIZE;
896    } else {
897       /* Force the point size to be as if the vertex output was disabled. */
898       psize_min = state->point_size;
899       psize_max = state->point_size;
900    }
901    rs->max_point_size = psize_max;
902 
903    /* Divide by two, because 0.5 = 1 pixel. */
904    si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
905                   S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
906                      S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)));
907 
908    si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
909                   S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)));
910    si_pm4_set_reg(
911       pm4, R_028A48_PA_SC_MODE_CNTL_0,
912       S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
913          S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) |
914          S_028A48_VPORT_SCISSOR_ENABLE(1) |
915          S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
916 
917    si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
918    si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
919                   S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
920                      S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
921                      S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
922                      S_028814_FACE(!state->front_ccw) |
923                      S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
924                      S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
925                      S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
926                      S_028814_POLY_MODE(rs->polygon_mode_enabled) |
927                      S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
928                      S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) |
929                      /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */
930                      S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? rs->polygon_mode_enabled : 0));
931 
932    if (!rs->uses_poly_offset)
933       return rs;
934 
935    rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
936    if (!rs->pm4_poly_offset) {
937       FREE(rs);
938       return NULL;
939    }
940 
941    /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
942    for (i = 0; i < 3; i++) {
943       struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
944       float offset_units = state->offset_units;
945       float offset_scale = state->offset_scale * 16.0f;
946       uint32_t pa_su_poly_offset_db_fmt_cntl = 0;
947 
948       if (!state->offset_units_unscaled) {
949          switch (i) {
950          case 0: /* 16-bit zbuffer */
951             offset_units *= 4.0f;
952             pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
953             break;
954          case 1: /* 24-bit zbuffer */
955             offset_units *= 2.0f;
956             pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
957             break;
958          case 2: /* 32-bit zbuffer */
959             offset_units *= 1.0f;
960             pa_su_poly_offset_db_fmt_cntl =
961                S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
962             break;
963          }
964       }
965 
966       si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
967       si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
968       si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
969       si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
970       si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
971    }
972 
973    return rs;
974 }
975 
static void si_bind_rs_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer;
   struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;

   /* Binding NULL selects the rasterizer-discard state as the fallback. */
   if (!rs)
      rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;

   if (old_rs->multisample_enable != rs->multisample_enable) {
      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);

      /* Update the small primitive filter workaround if necessary. */
      if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);

      /* NGG cull state uses multisample_enable. */
      if (sctx->screen->use_ngg_culling)
         si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
   }

   /* Propagate vertex color clamping into the per-draw VS state bits. */
   sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
   sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);

   si_pm4_bind_state(sctx, rasterizer, rs);
   si_update_poly_offset_state(sctx);

   /* Dirty only the atoms whose inputs actually changed. */
   if (old_rs->scissor_enable != rs->scissor_enable)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);

   if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size ||
       old_rs->half_pixel_center != rs->half_pixel_center)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);

   if (old_rs->clip_halfz != rs->clip_halfz)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);

   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
       old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);

   /* Request a shader re-selection when any of these fields changed. */
   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
       old_rs->rasterizer_discard != rs->rasterizer_discard ||
       old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
       old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
       old_rs->multisample_enable != rs->multisample_enable ||
       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
       old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
       old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
       old_rs->force_persample_interp != rs->force_persample_interp ||
       old_rs->polygon_mode_is_points != rs->polygon_mode_is_points)
      sctx->do_update_shaders = true;
}
1029 
si_delete_rs_state(struct pipe_context * ctx,void * state)1030 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
1031 {
1032    struct si_context *sctx = (struct si_context *)ctx;
1033    struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
1034 
1035    if (sctx->queued.named.rasterizer == state)
1036       si_bind_rs_state(ctx, sctx->discard_rasterizer_state);
1037 
1038    FREE(rs->pm4_poly_offset);
1039    si_pm4_delete_state(sctx, rasterizer, rs);
1040 }
1041 
1042 /*
1043  * infeered state between dsa and stencil ref
1044  */
/* Emit DB_STENCILREFMASK and DB_STENCILREFMASK_BF: the stencil reference
 * values come from the stencil-ref state, the value/write masks from the
 * bound DSA state. */
static void si_emit_stencil_ref(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
   struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;

   /* Two consecutive registers: front-face masks, then back-face (_BF). */
   radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
   radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
                      S_028430_STENCILMASK(dsa->valuemask[0]) |
                      S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1));
   radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
                      S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
                      S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
                      S_028434_STENCILOPVAL_BF(1));
}
1060 
si_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref * state)1061 static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state)
1062 {
1063    struct si_context *sctx = (struct si_context *)ctx;
1064 
1065    if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
1066       return;
1067 
1068    sctx->stencil_ref.state = *state;
1069    si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
1070 }
1071 
1072 /*
1073  * DSA
1074  */
1075 
si_translate_stencil_op(int s_op)1076 static uint32_t si_translate_stencil_op(int s_op)
1077 {
1078    switch (s_op) {
1079    case PIPE_STENCIL_OP_KEEP:
1080       return V_02842C_STENCIL_KEEP;
1081    case PIPE_STENCIL_OP_ZERO:
1082       return V_02842C_STENCIL_ZERO;
1083    case PIPE_STENCIL_OP_REPLACE:
1084       return V_02842C_STENCIL_REPLACE_TEST;
1085    case PIPE_STENCIL_OP_INCR:
1086       return V_02842C_STENCIL_ADD_CLAMP;
1087    case PIPE_STENCIL_OP_DECR:
1088       return V_02842C_STENCIL_SUB_CLAMP;
1089    case PIPE_STENCIL_OP_INCR_WRAP:
1090       return V_02842C_STENCIL_ADD_WRAP;
1091    case PIPE_STENCIL_OP_DECR_WRAP:
1092       return V_02842C_STENCIL_SUB_WRAP;
1093    case PIPE_STENCIL_OP_INVERT:
1094       return V_02842C_STENCIL_INVERT;
1095    default:
1096       PRINT_ERR("Unknown stencil op %d", s_op);
1097       assert(0);
1098       break;
1099    }
1100    return 0;
1101 }
1102 
si_dsa_writes_stencil(const struct pipe_stencil_state * s)1103 static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
1104 {
1105    return s->enabled && s->writemask &&
1106           (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP ||
1107            s->zpass_op != PIPE_STENCIL_OP_KEEP);
1108 }
1109 
si_order_invariant_stencil_op(enum pipe_stencil_op op)1110 static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
1111 {
1112    /* REPLACE is normally order invariant, except when the stencil
1113     * reference value is written by the fragment shader. Tracking this
1114     * interaction does not seem worth the effort, so be conservative. */
1115    return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE;
1116 }
1117 
1118 /* Compute whether, assuming Z writes are disabled, this stencil state is order
1119  * invariant in the sense that the set of passing fragments as well as the
1120  * final stencil buffer result does not depend on the order of fragments. */
si_order_invariant_stencil_state(const struct pipe_stencil_state * state)1121 static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
1122 {
1123    return !state->enabled || !state->writemask ||
1124           /* The following assumes that Z writes are disabled. */
1125           (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) &&
1126            si_order_invariant_stencil_op(state->zfail_op)) ||
1127           (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op));
1128 }
1129 
si_create_dsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1130 static void *si_create_dsa_state(struct pipe_context *ctx,
1131                                  const struct pipe_depth_stencil_alpha_state *state)
1132 {
1133    struct si_context *sctx = (struct si_context *)ctx;
1134    struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
1135    struct si_pm4_state *pm4 = &dsa->pm4;
1136    unsigned db_depth_control;
1137    uint32_t db_stencil_control = 0;
1138 
1139    if (!dsa) {
1140       return NULL;
1141    }
1142 
1143    dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
1144    dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
1145    dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
1146    dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;
1147 
1148    db_depth_control =
1149       S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
1150       S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
1151 
1152    /* stencil */
1153    if (state->stencil[0].enabled) {
1154       db_depth_control |= S_028800_STENCIL_ENABLE(1);
1155       db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
1156       db_stencil_control |=
1157          S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
1158       db_stencil_control |=
1159          S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
1160       db_stencil_control |=
1161          S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));
1162 
1163       if (state->stencil[1].enabled) {
1164          db_depth_control |= S_028800_BACKFACE_ENABLE(1);
1165          db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
1166          db_stencil_control |=
1167             S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
1168          db_stencil_control |=
1169             S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
1170          db_stencil_control |=
1171             S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
1172       }
1173    }
1174 
1175    /* alpha */
1176    if (state->alpha.enabled) {
1177       dsa->alpha_func = state->alpha.func;
1178 
1179       si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
1180                      fui(state->alpha.ref_value));
1181    } else {
1182       dsa->alpha_func = PIPE_FUNC_ALWAYS;
1183    }
1184 
1185    si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1186    if (state->stencil[0].enabled)
1187       si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
1188    if (state->depth.bounds_test) {
1189       si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
1190       si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
1191    }
1192 
1193    dsa->depth_enabled = state->depth.enabled;
1194    dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask;
1195    dsa->stencil_enabled = state->stencil[0].enabled;
1196    dsa->stencil_write_enabled =
1197       state->stencil[0].enabled &&
1198       (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1]));
1199    dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled;
1200 
1201    bool zfunc_is_ordered =
1202       state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS ||
1203       state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER ||
1204       state->depth.func == PIPE_FUNC_GEQUAL;
1205 
1206    bool nozwrite_and_order_invariant_stencil =
1207       !dsa->db_can_write ||
1208       (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) &&
1209        si_order_invariant_stencil_state(&state->stencil[1]));
1210 
1211    dsa->order_invariance[1].zs =
1212       nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered);
1213    dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
1214 
1215    dsa->order_invariance[1].pass_set =
1216       nozwrite_and_order_invariant_stencil ||
1217       (!dsa->stencil_write_enabled &&
1218        (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER));
1219    dsa->order_invariance[0].pass_set =
1220       !dsa->depth_write_enabled ||
1221       (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER);
1222 
1223    dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights &&
1224                                         !dsa->stencil_write_enabled && dsa->depth_write_enabled &&
1225                                         zfunc_is_ordered;
1226    dsa->order_invariance[0].pass_last =
1227       sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered;
1228 
1229    return dsa;
1230 }
1231 
static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
   struct si_state_dsa *dsa = state;

   /* Binding NULL selects the no-op DSA state as the fallback. */
   if (!dsa)
      dsa = (struct si_state_dsa *)sctx->noop_dsa;

   si_pm4_bind_state(sctx, dsa, dsa);

   /* The stencil value/write masks are emitted by the stencil-ref atom,
    * not by the DSA PM4 state, so re-emit that atom when they change. */
   if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
              sizeof(struct si_dsa_stencil_ref_part)) != 0) {
      sctx->stencil_ref.dsa_part = dsa->stencil_ref;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
   }

   /* A different alpha test function requires a shader update. */
   if (old_dsa->alpha_func != dsa->alpha_func)
      sctx->do_update_shaders = true;

   /* These fields feed the binner (DPBB) state. */
   if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
                                       old_dsa->stencil_enabled != dsa->stencil_enabled ||
                                       old_dsa->db_can_write != dsa->db_can_write)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);

   /* Order-invariance feeds out-of-order rasterization (msaa_config). */
   if (sctx->screen->has_out_of_order_rast &&
       (memcmp(old_dsa->order_invariance, dsa->order_invariance,
               sizeof(old_dsa->order_invariance))))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}
1262 
si_delete_dsa_state(struct pipe_context * ctx,void * state)1263 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
1264 {
1265    struct si_context *sctx = (struct si_context *)ctx;
1266 
1267    if (sctx->queued.named.dsa == state)
1268       si_bind_dsa_state(ctx, sctx->noop_dsa);
1269 
1270    si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
1271 }
1272 
si_create_db_flush_dsa(struct si_context * sctx)1273 static void *si_create_db_flush_dsa(struct si_context *sctx)
1274 {
1275    struct pipe_depth_stencil_alpha_state dsa = {};
1276 
1277    return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
1278 }
1279 
1280 /* DB RENDER STATE */
1281 
/* Globally pause/resume active queries (used e.g. around internal blits). */
static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
{
   struct si_context *sctx = (struct si_context *)ctx;

   /* Pipeline stat & streamout queries. */
   if (enable) {
      sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
      sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
   } else {
      sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
      sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
   }

   /* Occlusion queries. */
   if (sctx->occlusion_queries_disabled != !enable) {
      sctx->occlusion_queries_disabled = !enable;
      /* DB_COUNT_CONTROL is derived from this flag in si_emit_db_render_state. */
      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
   }
}
1301 
si_set_occlusion_query_state(struct si_context * sctx,bool old_perfect_enable)1302 void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable)
1303 {
1304    si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1305 
1306    bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
1307 
1308    if (perfect_enable != old_perfect_enable)
1309       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
1310 }
1311 
/* Save the compute bindings (program, const buffer 0, first 3 shader
 * buffers) that query-buffer-object handling will overwrite. */
void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
{
   st->saved_compute = sctx->cs_shader_state.program;

   si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);

   st->saved_ssbo_writable_mask = 0;

   /* Record which of the 3 saved SSBO slots were writable so the restore
    * can pass the same mask back to set_shader_buffers. */
   for (unsigned i = 0; i < 3; i++) {
      if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
          (1u << si_get_shaderbuf_slot(i)))
         st->saved_ssbo_writable_mask |= 1 << i;
   }
}
1327 
/* Restore the compute bindings saved by si_save_qbo_state and drop the
 * buffer references held in *st. */
void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
{
   sctx->b.bind_compute_state(&sctx->b, st->saved_compute);

   sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
   pipe_resource_reference(&st->saved_const0.buffer, NULL);

   sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
                              st->saved_ssbo_writable_mask);
   /* Release the references held by the saved state. */
   for (unsigned i = 0; i < 3; ++i)
      pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
}
1340 
/* Emit the DB (depth-block) render state: DB_RENDER_CONTROL,
 * DB_COUNT_CONTROL, DB_RENDER_OVERRIDE2 and DB_SHADER_CONTROL.
 * Sets sctx->context_roll if any register write was actually emitted. */
static void si_emit_db_render_state(struct si_context *sctx)
{
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   unsigned db_shader_control, db_render_control, db_count_control;
   /* Snapshot the CS size; radeon_opt_set_context_reg* elides redundant
    * writes, so comparing cdw afterwards tells us if anything was emitted. */
   unsigned initial_cdw = sctx->gfx_cs->current.cdw;

   /* DB_RENDER_CONTROL: one of three mutually exclusive modes:
    * DB->CB copy, in-place decompress flush, or normal rendering. */
   if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) {
      db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
                          S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
                          S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
   } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
      db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
                          S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
   } else {
      db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
                          S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
   }

   /* DB_COUNT_CONTROL (occlusion queries) */
   if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) {
      bool perfect = sctx->num_perfect_occlusion_queries > 0;
      /* The conservative-count disable field is only set on GFX10+. */
      bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;

      if (sctx->chip_class >= GFX7) {
         unsigned log_sample_rate = sctx->framebuffer.log_samples;

         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
                            S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
      } else {
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
                            S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
      }
   } else {
      /* Disable occlusion queries. */
      if (sctx->chip_class >= GFX7) {
         db_count_control = 0;
      } else {
         db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
      }
   }

   /* DB_RENDER_CONTROL (0x28000) and DB_COUNT_CONTROL (0x28004) are
    * consecutive registers, written as a single 2-dword set. */
   radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL,
                               db_render_control, db_count_control);

   /* DB_RENDER_OVERRIDE2 */
   radeon_opt_set_context_reg(
      sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
      S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
      S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
      S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) |
      S_028010_CENTROID_COMPUTATION_MODE(sctx->chip_class >= GFX10_3 ? 2 : 0));

   db_shader_control = sctx->ps_db_shader_control;

   /* Bug workaround for smoothing (overrasterization) on GFX6. */
   if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
      db_shader_control &= C_02880C_Z_ORDER;
      db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
   }

   /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
   if (!rs->multisample_enable)
      db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;

   /* RB+ hardware where RB+ is not allowed: dual-quad mode must be off. */
   if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed)
      db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);

   radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL,
                              db_shader_control);

   /* Any emitted context-register write rolls the hw context. */
   if (initial_cdw != sctx->gfx_cs->current.cdw)
      sctx->context_roll = true;
}
1417 
1418 /*
1419  * format translation
1420  */
si_translate_colorformat(enum chip_class chip_class,enum pipe_format format)1421 static uint32_t si_translate_colorformat(enum chip_class chip_class,
1422                                          enum pipe_format format)
1423 {
1424    const struct util_format_description *desc = util_format_description(format);
1425    if (!desc)
1426       return V_028C70_COLOR_INVALID;
1427 
1428 #define HAS_SIZE(x, y, z, w)                                                                       \
1429    (desc->channel[0].size == (x) && desc->channel[1].size == (y) &&                                \
1430     desc->channel[2].size == (z) && desc->channel[3].size == (w))
1431 
1432    if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
1433       return V_028C70_COLOR_10_11_11;
1434 
1435    if (chip_class >= GFX10_3 &&
1436        format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */
1437       return V_028C70_COLOR_5_9_9_9;
1438 
1439    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
1440       return V_028C70_COLOR_INVALID;
1441 
1442    /* hw cannot support mixed formats (except depth/stencil, since
1443     * stencil is not written to). */
1444    if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
1445       return V_028C70_COLOR_INVALID;
1446 
1447    switch (desc->nr_channels) {
1448    case 1:
1449       switch (desc->channel[0].size) {
1450       case 8:
1451          return V_028C70_COLOR_8;
1452       case 16:
1453          return V_028C70_COLOR_16;
1454       case 32:
1455          return V_028C70_COLOR_32;
1456       }
1457       break;
1458    case 2:
1459       if (desc->channel[0].size == desc->channel[1].size) {
1460          switch (desc->channel[0].size) {
1461          case 8:
1462             return V_028C70_COLOR_8_8;
1463          case 16:
1464             return V_028C70_COLOR_16_16;
1465          case 32:
1466             return V_028C70_COLOR_32_32;
1467          }
1468       } else if (HAS_SIZE(8, 24, 0, 0)) {
1469          return V_028C70_COLOR_24_8;
1470       } else if (HAS_SIZE(24, 8, 0, 0)) {
1471          return V_028C70_COLOR_8_24;
1472       }
1473       break;
1474    case 3:
1475       if (HAS_SIZE(5, 6, 5, 0)) {
1476          return V_028C70_COLOR_5_6_5;
1477       } else if (HAS_SIZE(32, 8, 24, 0)) {
1478          return V_028C70_COLOR_X24_8_32_FLOAT;
1479       }
1480       break;
1481    case 4:
1482       if (desc->channel[0].size == desc->channel[1].size &&
1483           desc->channel[0].size == desc->channel[2].size &&
1484           desc->channel[0].size == desc->channel[3].size) {
1485          switch (desc->channel[0].size) {
1486          case 4:
1487             return V_028C70_COLOR_4_4_4_4;
1488          case 8:
1489             return V_028C70_COLOR_8_8_8_8;
1490          case 16:
1491             return V_028C70_COLOR_16_16_16_16;
1492          case 32:
1493             return V_028C70_COLOR_32_32_32_32;
1494          }
1495       } else if (HAS_SIZE(5, 5, 5, 1)) {
1496          return V_028C70_COLOR_1_5_5_5;
1497       } else if (HAS_SIZE(1, 5, 5, 5)) {
1498          return V_028C70_COLOR_5_5_5_1;
1499       } else if (HAS_SIZE(10, 10, 10, 2)) {
1500          return V_028C70_COLOR_2_10_10_10;
1501       }
1502       break;
1503    }
1504    return V_028C70_COLOR_INVALID;
1505 }
1506 
si_colorformat_endian_swap(uint32_t colorformat)1507 static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
1508 {
1509    if (SI_BIG_ENDIAN) {
1510       switch (colorformat) {
1511       /* 8-bit buffers. */
1512       case V_028C70_COLOR_8:
1513          return V_028C70_ENDIAN_NONE;
1514 
1515       /* 16-bit buffers. */
1516       case V_028C70_COLOR_5_6_5:
1517       case V_028C70_COLOR_1_5_5_5:
1518       case V_028C70_COLOR_4_4_4_4:
1519       case V_028C70_COLOR_16:
1520       case V_028C70_COLOR_8_8:
1521          return V_028C70_ENDIAN_8IN16;
1522 
1523       /* 32-bit buffers. */
1524       case V_028C70_COLOR_8_8_8_8:
1525       case V_028C70_COLOR_2_10_10_10:
1526       case V_028C70_COLOR_8_24:
1527       case V_028C70_COLOR_24_8:
1528       case V_028C70_COLOR_16_16:
1529          return V_028C70_ENDIAN_8IN32;
1530 
1531       /* 64-bit buffers. */
1532       case V_028C70_COLOR_16_16_16_16:
1533          return V_028C70_ENDIAN_8IN16;
1534 
1535       case V_028C70_COLOR_32_32:
1536          return V_028C70_ENDIAN_8IN32;
1537 
1538       /* 128-bit buffers. */
1539       case V_028C70_COLOR_32_32_32_32:
1540          return V_028C70_ENDIAN_8IN32;
1541       default:
1542          return V_028C70_ENDIAN_NONE; /* Unsupported. */
1543       }
1544    } else {
1545       return V_028C70_ENDIAN_NONE;
1546    }
1547 }
1548 
si_translate_dbformat(enum pipe_format format)1549 static uint32_t si_translate_dbformat(enum pipe_format format)
1550 {
1551    switch (format) {
1552    case PIPE_FORMAT_Z16_UNORM:
1553       return V_028040_Z_16;
1554    case PIPE_FORMAT_S8_UINT_Z24_UNORM:
1555    case PIPE_FORMAT_X8Z24_UNORM:
1556    case PIPE_FORMAT_Z24X8_UNORM:
1557    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1558       return V_028040_Z_24; /* deprecated on AMD GCN */
1559    case PIPE_FORMAT_Z32_FLOAT:
1560    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1561       return V_028040_Z_32_FLOAT;
1562    default:
1563       return V_028040_Z_INVALID;
1564    }
1565 }
1566 
1567 /*
1568  * Texture translation
1569  */
1570 
/* Translate a gallium format to the GFX6-GFX9 IMG_DATA_FORMAT field of an
 * image descriptor.  Returns ~0 when the format has no hw equivalent
 * (callers compare against ~0U).  `first_non_void` is the index of the
 * first non-void channel, or negative if there is none. */
static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format,
                                       const struct util_format_description *desc,
                                       int first_non_void)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   bool uniform = true;
   int i;

   /* GFX10+ uses the generated gfx10_format_table instead. */
   assert(sscreen->info.chip_class <= GFX9);

   /* Colorspace (return non-RGB formats directly). */
   switch (desc->colorspace) {
   /* Depth stencil formats */
   case UTIL_FORMAT_COLORSPACE_ZS:
      switch (format) {
      case PIPE_FORMAT_Z16_UNORM:
         return V_008F14_IMG_DATA_FORMAT_16;
      case PIPE_FORMAT_X24S8_UINT:
      case PIPE_FORMAT_S8X24_UINT:
         /*
          * Implemented as an 8_8_8_8 data format to fix texture
          * gathers in stencil sampling. This affects at least
          * GL45-CTS.texture_cube_map_array.sampling on GFX8.
          */
         if (sscreen->info.chip_class <= GFX8)
            return V_008F14_IMG_DATA_FORMAT_8_8_8_8;

         if (format == PIPE_FORMAT_X24S8_UINT)
            return V_008F14_IMG_DATA_FORMAT_8_24;
         else
            return V_008F14_IMG_DATA_FORMAT_24_8;
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         return V_008F14_IMG_DATA_FORMAT_8_24;
      case PIPE_FORMAT_X8Z24_UNORM:
      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
         return V_008F14_IMG_DATA_FORMAT_24_8;
      case PIPE_FORMAT_S8_UINT:
         return V_008F14_IMG_DATA_FORMAT_8;
      case PIPE_FORMAT_Z32_FLOAT:
         return V_008F14_IMG_DATA_FORMAT_32;
      case PIPE_FORMAT_X32_S8X24_UINT:
      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
         return V_008F14_IMG_DATA_FORMAT_X24_8_32;
      default:
         goto out_unknown;
      }

   case UTIL_FORMAT_COLORSPACE_YUV:
      goto out_unknown; /* TODO */

   case UTIL_FORMAT_COLORSPACE_SRGB:
      if (desc->nr_channels != 4 && desc->nr_channels != 1)
         goto out_unknown;
      break;

   default:
      break;
   }

   /* RGTC (BC4/BC5) compressed formats. */
   if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
      if (!sscreen->info.has_format_bc1_through_bc7)
         goto out_unknown;

      switch (format) {
      case PIPE_FORMAT_RGTC1_SNORM:
      case PIPE_FORMAT_LATC1_SNORM:
      case PIPE_FORMAT_RGTC1_UNORM:
      case PIPE_FORMAT_LATC1_UNORM:
         return V_008F14_IMG_DATA_FORMAT_BC4;
      case PIPE_FORMAT_RGTC2_SNORM:
      case PIPE_FORMAT_LATC2_SNORM:
      case PIPE_FORMAT_RGTC2_UNORM:
      case PIPE_FORMAT_LATC2_UNORM:
         return V_008F14_IMG_DATA_FORMAT_BC5;
      default:
         goto out_unknown;
      }
   }

   /* Native ETC2 is only handled on these specific chips. */
   if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
       (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 ||
        sscreen->info.family == CHIP_RAVEN || sscreen->info.family == CHIP_RAVEN2)) {
      switch (format) {
      case PIPE_FORMAT_ETC1_RGB8:
      case PIPE_FORMAT_ETC2_RGB8:
      case PIPE_FORMAT_ETC2_SRGB8:
         return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
      case PIPE_FORMAT_ETC2_RGB8A1:
      case PIPE_FORMAT_ETC2_SRGB8A1:
         return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
      case PIPE_FORMAT_ETC2_RGBA8:
      case PIPE_FORMAT_ETC2_SRGBA8:
         return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
      case PIPE_FORMAT_ETC2_R11_UNORM:
      case PIPE_FORMAT_ETC2_R11_SNORM:
         return V_008F14_IMG_DATA_FORMAT_ETC2_R;
      case PIPE_FORMAT_ETC2_RG11_UNORM:
      case PIPE_FORMAT_ETC2_RG11_SNORM:
         return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
      default:
         goto out_unknown;
      }
   }

   /* BPTC (BC6/BC7) compressed formats. */
   if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
      if (!sscreen->info.has_format_bc1_through_bc7)
         goto out_unknown;

      switch (format) {
      case PIPE_FORMAT_BPTC_RGBA_UNORM:
      case PIPE_FORMAT_BPTC_SRGBA:
         return V_008F14_IMG_DATA_FORMAT_BC7;
      case PIPE_FORMAT_BPTC_RGB_FLOAT:
      case PIPE_FORMAT_BPTC_RGB_UFLOAT:
         return V_008F14_IMG_DATA_FORMAT_BC6;
      default:
         goto out_unknown;
      }
   }

   /* Subsampled (packed YUV-style) formats. */
   if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
      switch (format) {
      case PIPE_FORMAT_R8G8_B8G8_UNORM:
      case PIPE_FORMAT_G8R8_B8R8_UNORM:
         return V_008F14_IMG_DATA_FORMAT_GB_GR;
      case PIPE_FORMAT_G8R8_G8B8_UNORM:
      case PIPE_FORMAT_R8G8_R8B8_UNORM:
         return V_008F14_IMG_DATA_FORMAT_BG_RG;
      default:
         goto out_unknown;
      }
   }

   /* S3TC (BC1-BC3) compressed formats. */
   if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
      if (!sscreen->info.has_format_bc1_through_bc7)
         goto out_unknown;

      switch (format) {
      case PIPE_FORMAT_DXT1_RGB:
      case PIPE_FORMAT_DXT1_RGBA:
      case PIPE_FORMAT_DXT1_SRGB:
      case PIPE_FORMAT_DXT1_SRGBA:
         return V_008F14_IMG_DATA_FORMAT_BC1;
      case PIPE_FORMAT_DXT3_RGBA:
      case PIPE_FORMAT_DXT3_SRGBA:
         return V_008F14_IMG_DATA_FORMAT_BC2;
      case PIPE_FORMAT_DXT5_RGBA:
      case PIPE_FORMAT_DXT5_SRGBA:
         return V_008F14_IMG_DATA_FORMAT_BC3;
      default:
         goto out_unknown;
      }
   }

   /* Packed-float formats (not "plain"). */
   if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
      return V_008F14_IMG_DATA_FORMAT_10_11_11;
   }

   /* R8G8Bx_SNORM - TODO CxV8U8 */

   /* hw cannot support mixed formats (except depth/stencil, since only
    * depth is read).*/
   if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
      goto out_unknown;

   /* See whether the components are of the same size. */
   for (i = 1; i < desc->nr_channels; i++) {
      uniform = uniform && desc->channel[0].size == desc->channel[i].size;
   }

   /* Non-uniform formats. */
   if (!uniform) {
      switch (desc->nr_channels) {
      case 3:
         if (desc->channel[0].size == 5 && desc->channel[1].size == 6 &&
             desc->channel[2].size == 5) {
            return V_008F14_IMG_DATA_FORMAT_5_6_5;
         }
         goto out_unknown;
      case 4:
         if (desc->channel[0].size == 5 && desc->channel[1].size == 5 &&
             desc->channel[2].size == 5 && desc->channel[3].size == 1) {
            return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
         }
         if (desc->channel[0].size == 1 && desc->channel[1].size == 5 &&
             desc->channel[2].size == 5 && desc->channel[3].size == 5) {
            return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
         }
         if (desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
             desc->channel[2].size == 10 && desc->channel[3].size == 2) {
            return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
         }
         goto out_unknown;
      }
      goto out_unknown;
   }

   /* All-void formats (or out-of-range indices) are rejected here. */
   if (first_non_void < 0 || first_non_void > 3)
      goto out_unknown;

   /* uniform formats */
   switch (desc->channel[first_non_void].size) {
   case 4:
      switch (desc->nr_channels) {
#if 0 /* Not supported for render targets */
      case 2:
         return V_008F14_IMG_DATA_FORMAT_4_4;
#endif
      case 4:
         return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
      }
      break;
   case 8:
      switch (desc->nr_channels) {
      case 1:
         return V_008F14_IMG_DATA_FORMAT_8;
      case 2:
         return V_008F14_IMG_DATA_FORMAT_8_8;
      case 4:
         return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
      }
      break;
   case 16:
      switch (desc->nr_channels) {
      case 1:
         return V_008F14_IMG_DATA_FORMAT_16;
      case 2:
         return V_008F14_IMG_DATA_FORMAT_16_16;
      case 4:
         return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
      }
      break;
   case 32:
      switch (desc->nr_channels) {
      case 1:
         return V_008F14_IMG_DATA_FORMAT_32;
      case 2:
         return V_008F14_IMG_DATA_FORMAT_32_32;
#if 0 /* Not supported for render targets */
      case 3:
         return V_008F14_IMG_DATA_FORMAT_32_32_32;
#endif
      case 4:
         return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
      }
   }

out_unknown:
   return ~0;
}
1824 
si_tex_wrap(unsigned wrap)1825 static unsigned si_tex_wrap(unsigned wrap)
1826 {
1827    switch (wrap) {
1828    default:
1829    case PIPE_TEX_WRAP_REPEAT:
1830       return V_008F30_SQ_TEX_WRAP;
1831    case PIPE_TEX_WRAP_CLAMP:
1832       return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
1833    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
1834       return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
1835    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
1836       return V_008F30_SQ_TEX_CLAMP_BORDER;
1837    case PIPE_TEX_WRAP_MIRROR_REPEAT:
1838       return V_008F30_SQ_TEX_MIRROR;
1839    case PIPE_TEX_WRAP_MIRROR_CLAMP:
1840       return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
1841    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
1842       return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
1843    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
1844       return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
1845    }
1846 }
1847 
si_tex_mipfilter(unsigned filter)1848 static unsigned si_tex_mipfilter(unsigned filter)
1849 {
1850    switch (filter) {
1851    case PIPE_TEX_MIPFILTER_NEAREST:
1852       return V_008F38_SQ_TEX_Z_FILTER_POINT;
1853    case PIPE_TEX_MIPFILTER_LINEAR:
1854       return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
1855    default:
1856    case PIPE_TEX_MIPFILTER_NONE:
1857       return V_008F38_SQ_TEX_Z_FILTER_NONE;
1858    }
1859 }
1860 
si_tex_compare(unsigned compare)1861 static unsigned si_tex_compare(unsigned compare)
1862 {
1863    switch (compare) {
1864    default:
1865    case PIPE_FUNC_NEVER:
1866       return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
1867    case PIPE_FUNC_LESS:
1868       return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
1869    case PIPE_FUNC_EQUAL:
1870       return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
1871    case PIPE_FUNC_LEQUAL:
1872       return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
1873    case PIPE_FUNC_GREATER:
1874       return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
1875    case PIPE_FUNC_NOTEQUAL:
1876       return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
1877    case PIPE_FUNC_GEQUAL:
1878       return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
1879    case PIPE_FUNC_ALWAYS:
1880       return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
1881    }
1882 }
1883 
si_tex_dim(struct si_screen * sscreen,struct si_texture * tex,unsigned view_target,unsigned nr_samples)1884 static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target,
1885                            unsigned nr_samples)
1886 {
1887    unsigned res_target = tex->buffer.b.b.target;
1888 
1889    if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY)
1890       res_target = view_target;
1891    /* If interpreting cubemaps as something else, set 2D_ARRAY. */
1892    else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY)
1893       res_target = PIPE_TEXTURE_2D_ARRAY;
1894 
1895    /* GFX9 allocates 1D textures as 2D. */
1896    if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) &&
1897        sscreen->info.chip_class == GFX9 &&
1898        tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
1899       if (res_target == PIPE_TEXTURE_1D)
1900          res_target = PIPE_TEXTURE_2D;
1901       else
1902          res_target = PIPE_TEXTURE_2D_ARRAY;
1903    }
1904 
1905    switch (res_target) {
1906    default:
1907    case PIPE_TEXTURE_1D:
1908       return V_008F1C_SQ_RSRC_IMG_1D;
1909    case PIPE_TEXTURE_1D_ARRAY:
1910       return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
1911    case PIPE_TEXTURE_2D:
1912    case PIPE_TEXTURE_RECT:
1913       return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D;
1914    case PIPE_TEXTURE_2D_ARRAY:
1915       return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
1916    case PIPE_TEXTURE_3D:
1917       return V_008F1C_SQ_RSRC_IMG_3D;
1918    case PIPE_TEXTURE_CUBE:
1919    case PIPE_TEXTURE_CUBE_ARRAY:
1920       return V_008F1C_SQ_RSRC_IMG_CUBE;
1921    }
1922 }
1923 
1924 /*
1925  * Format support testing
1926  */
1927 
si_is_sampler_format_supported(struct pipe_screen * screen,enum pipe_format format)1928 static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
1929 {
1930    struct si_screen *sscreen = (struct si_screen *)screen;
1931 
1932    if (sscreen->info.chip_class >= GFX10) {
1933       const struct gfx10_format *fmt = &gfx10_format_table[format];
1934       if (!fmt->img_format || fmt->buffers_only)
1935          return false;
1936       return true;
1937    }
1938 
1939    const struct util_format_description *desc = util_format_description(format);
1940    if (!desc)
1941       return false;
1942 
1943    return si_translate_texformat(screen, format, desc,
1944                                  util_format_get_first_non_void_channel(format)) != ~0U;
1945 }
1946 
/* Translate a vertex/texel buffer format to the GFX6-GFX9 BUF_DATA_FORMAT
 * field of a buffer descriptor.  Returns V_008F0C_BUF_DATA_FORMAT_INVALID
 * for unsupported formats.  `first_non_void` must be >= 0 except for
 * R11G11B10_FLOAT, which is handled before the assert. */
static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen,
                                               const struct util_format_description *desc,
                                               int first_non_void)
{
   int i;

   /* GFX10+ uses the generated format table instead. */
   assert(((struct si_screen *)screen)->info.chip_class <= GFX9);

   if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
      return V_008F0C_BUF_DATA_FORMAT_10_11_11;

   assert(first_non_void >= 0);

   if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
       desc->channel[2].size == 10 && desc->channel[3].size == 2)
      return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;

   /* See whether the components are of the same size. */
   for (i = 0; i < desc->nr_channels; i++) {
      if (desc->channel[first_non_void].size != desc->channel[i].size)
         return V_008F0C_BUF_DATA_FORMAT_INVALID;
   }

   switch (desc->channel[first_non_void].size) {
   case 8:
      switch (desc->nr_channels) {
      case 1:
      case 3: /* 3 loads */
         return V_008F0C_BUF_DATA_FORMAT_8;
      case 2:
         return V_008F0C_BUF_DATA_FORMAT_8_8;
      case 4:
         return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
      }
      break;
   case 16:
      switch (desc->nr_channels) {
      case 1:
      case 3: /* 3 loads */
         return V_008F0C_BUF_DATA_FORMAT_16;
      case 2:
         return V_008F0C_BUF_DATA_FORMAT_16_16;
      case 4:
         return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
      }
      break;
   case 32:
      switch (desc->nr_channels) {
      case 1:
         return V_008F0C_BUF_DATA_FORMAT_32;
      case 2:
         return V_008F0C_BUF_DATA_FORMAT_32_32;
      case 3:
         return V_008F0C_BUF_DATA_FORMAT_32_32_32;
      case 4:
         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
      }
      break;
   case 64:
      /* Legacy double formats.  Doubles are fetched as pairs of 32-bit
       * dwords; the comments give the number of fetch instructions. */
      switch (desc->nr_channels) {
      case 1: /* 1 load */
         return V_008F0C_BUF_DATA_FORMAT_32_32;
      case 2: /* 1 load */
         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
      case 3: /* 3 loads */
         return V_008F0C_BUF_DATA_FORMAT_32_32;
      case 4: /* 2 loads */
         return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
      }
      break;
   }

   return V_008F0C_BUF_DATA_FORMAT_INVALID;
}
2022 
si_translate_buffer_numformat(struct pipe_screen * screen,const struct util_format_description * desc,int first_non_void)2023 static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen,
2024                                               const struct util_format_description *desc,
2025                                               int first_non_void)
2026 {
2027    assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
2028 
2029    if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
2030       return V_008F0C_BUF_NUM_FORMAT_FLOAT;
2031 
2032    assert(first_non_void >= 0);
2033 
2034    switch (desc->channel[first_non_void].type) {
2035    case UTIL_FORMAT_TYPE_SIGNED:
2036    case UTIL_FORMAT_TYPE_FIXED:
2037       if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
2038          return V_008F0C_BUF_NUM_FORMAT_SINT;
2039       else if (desc->channel[first_non_void].normalized)
2040          return V_008F0C_BUF_NUM_FORMAT_SNORM;
2041       else
2042          return V_008F0C_BUF_NUM_FORMAT_SSCALED;
2043       break;
2044    case UTIL_FORMAT_TYPE_UNSIGNED:
2045       if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
2046          return V_008F0C_BUF_NUM_FORMAT_UINT;
2047       else if (desc->channel[first_non_void].normalized)
2048          return V_008F0C_BUF_NUM_FORMAT_UNORM;
2049       else
2050          return V_008F0C_BUF_NUM_FORMAT_USCALED;
2051       break;
2052    case UTIL_FORMAT_TYPE_FLOAT:
2053    default:
2054       return V_008F0C_BUF_NUM_FORMAT_FLOAT;
2055    }
2056 }
2057 
/* Return the subset of `usage` (a mask of PIPE_BIND_SHADER_IMAGE /
 * PIPE_BIND_SAMPLER_VIEW / PIPE_BIND_VERTEX_BUFFER) that the format
 * supports when accessed as a buffer, or 0 if unsupported. */
static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format,
                                              unsigned usage)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   const struct util_format_description *desc;
   int first_non_void;
   unsigned data_format;

   assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) ==
          0);

   desc = util_format_description(format);
   if (!desc)
      return 0;

   /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
    * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
    * for read-only access (with caveats surrounding bounds checks), but
    * obviously fails for write access which we have to implement for
    * shader images. Luckily, OpenGL doesn't expect this to be supported
    * anyway, and so the only impact is on PBO uploads / downloads, which
    * shouldn't be expected to be fast for GL_RGB anyway.
    */
   if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) {
      if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
         usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
         if (!usage)
            return 0;
      }
   }

   /* GFX10+: consult the generated table; img_format values >= 128 are
    * not valid buffer formats there. */
   if (sscreen->info.chip_class >= GFX10) {
      const struct gfx10_format *fmt = &gfx10_format_table[format];
      if (!fmt->img_format || fmt->img_format >= 128)
         return 0;
      return usage;
   }

   /* GFX6-GFX9: the format is usable if it has a valid data format. */
   first_non_void = util_format_get_first_non_void_channel(format);
   data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
   if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
      return 0;

   return usage;
}
2103 
si_is_colorbuffer_format_supported(enum chip_class chip_class,enum pipe_format format)2104 static bool si_is_colorbuffer_format_supported(enum chip_class chip_class,
2105                                                enum pipe_format format)
2106 {
2107    return si_translate_colorformat(chip_class, format) != V_028C70_COLOR_INVALID &&
2108           si_translate_colorswap(format, false) != ~0U;
2109 }
2110 
si_is_zs_format_supported(enum pipe_format format)2111 static bool si_is_zs_format_supported(enum pipe_format format)
2112 {
2113    return si_translate_dbformat(format) != V_028040_Z_INVALID;
2114 }
2115 
/* pipe_screen::is_format_supported.  Returns true only if EVERY bind flag
 * requested in `usage` is supported for the given format/target/sample
 * counts (retval must equal usage). */
static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
                                   enum pipe_texture_target target, unsigned sample_count,
                                   unsigned storage_sample_count, unsigned usage)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   unsigned retval = 0;

   if (target >= PIPE_MAX_TEXTURE_TYPES) {
      PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
      return false;
   }

   /* Storage samples can never exceed coverage samples. */
   if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
      return false;

   if (sample_count > 1) {
      if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
         return false;

      /* Only power-of-two sample counts are supported. */
      if (!util_is_power_of_two_or_zero(sample_count) ||
          !util_is_power_of_two_or_zero(storage_sample_count))
         return false;

      /* Chips with 1 RB don't increment occlusion queries at 16x MSAA sample rate,
       * so don't expose 16 samples there.
       */
      const unsigned max_eqaa_samples = sscreen->info.num_render_backends == 1 ? 8 : 16;
      const unsigned max_samples = 8;

      /* MSAA support without framebuffer attachments. */
      if (format == PIPE_FORMAT_NONE && sample_count <= max_eqaa_samples)
         return true;

      if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) {
         /* Color without EQAA or depth/stencil. */
         if (sample_count > max_samples || sample_count != storage_sample_count)
            return false;
      } else {
         /* Color with EQAA. */
         if (sample_count > max_eqaa_samples || storage_sample_count > max_samples)
            return false;
      }
   }

   /* Sampling and image load/store: buffers use the vertex-format path,
    * textures use the sampler-format path. */
   if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) {
      if (target == PIPE_BUFFER) {
         retval |= si_is_vertex_format_supported(
            screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE));
      } else {
         if (si_is_sampler_format_supported(screen, format))
            retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
      }
   }

   /* Render-target family of bind flags; blending is additionally
    * restricted to non-integer, non-depth formats. */
   if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                 PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
       si_is_colorbuffer_format_supported(sscreen->info.chip_class, format)) {
      retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                         PIPE_BIND_SHARED);
      if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
         retval |= usage & PIPE_BIND_BLENDABLE;
   }

   if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) {
      retval |= PIPE_BIND_DEPTH_STENCIL;
   }

   if (usage & PIPE_BIND_VERTEX_BUFFER) {
      retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER);
   }

   /* Linear layout: not for compressed or depth/stencil resources. */
   if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) &&
       !(usage & PIPE_BIND_DEPTH_STENCIL))
      retval |= PIPE_BIND_LINEAR;

   return retval == usage;
}
2194 
2195 /*
2196  * framebuffer handling
2197  */
2198 
/* Compute and cache the SPI color export formats for a color surface.
 *
 * ac_choose_spi_color_formats derives four export-format variants
 * (normal, alpha-only, blend, blend+alpha) from the CB format, component
 * swap and number type; stash each one on the surface so draw-time code
 * can select the matching SPI_SHADER_COL_FORMAT.
 */
static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap,
                                        unsigned ntype, bool is_depth)
{
   struct ac_spi_color_formats fmts = {};

   ac_choose_spi_color_formats(format, swap, ntype, is_depth, &fmts);

   surf->spi_shader_col_format = fmts.normal;
   surf->spi_shader_col_format_alpha = fmts.alpha;
   surf->spi_shader_col_format_blend = fmts.blend;
   surf->spi_shader_col_format_blend_alpha = fmts.blend_alpha;
}
2211 
/* Fill in the CB_COLOR* register values for a color render-target surface.
 *
 * Derives the hw number type (ntype), CB color format, component swap and
 * endian swap from the gallium format, then builds CB_COLOR_INFO,
 * CB_COLOR_ATTRIB (and ATTRIB2/ATTRIB3 on GFX9+), CB_COLOR_VIEW and
 * CB_DCC_CONTROL.  Also selects the matching SPI color export formats.
 * All results are cached on the surface; surf->color_initialized marks
 * the surface as done so this only runs once per surface.
 */
static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf)
{
   struct si_texture *tex = (struct si_texture *)surf->base.texture;
   unsigned color_info, color_attrib;
   unsigned format, swap, ntype, endian;
   const struct util_format_description *desc;
   int firstchan;
   unsigned blend_clamp = 0, blend_bypass = 0;

   /* Find the first non-void channel; it determines the number type. */
   desc = util_format_description(surf->base.format);
   for (firstchan = 0; firstchan < 4; firstchan++) {
      if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
         break;
      }
   }
   /* All-void (e.g. opaque) formats are treated as float. */
   if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
      ntype = V_028C70_NUMBER_FLOAT;
   } else {
      ntype = V_028C70_NUMBER_UNORM;
      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
         ntype = V_028C70_NUMBER_SRGB;
      else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
         if (desc->channel[firstchan].pure_integer) {
            ntype = V_028C70_NUMBER_SINT;
         } else {
            assert(desc->channel[firstchan].normalized);
            ntype = V_028C70_NUMBER_SNORM;
         }
      } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         if (desc->channel[firstchan].pure_integer) {
            ntype = V_028C70_NUMBER_UINT;
         } else {
            assert(desc->channel[firstchan].normalized);
            ntype = V_028C70_NUMBER_UNORM;
         }
      }
   }

   format = si_translate_colorformat(sctx->chip_class, surf->base.format);
   if (format == V_028C70_COLOR_INVALID) {
      PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
   }
   assert(format != V_028C70_COLOR_INVALID);
   swap = si_translate_colorswap(surf->base.format, false);
   endian = si_colorformat_endian_swap(format);

   /* blend clamp should be set for all NORM/SRGB types */
   if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM ||
       ntype == V_028C70_NUMBER_SRGB)
      blend_clamp = 1;

   /* set blend bypass according to docs if SINT/UINT or
      8/24 COLOR variants */
   if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
       format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
       format == V_028C70_COLOR_X24_8_32_FLOAT) {
      blend_clamp = 0;
      blend_bypass = 1;
   }

   /* Remember pure-integer 8-bit / 10-bit formats; the framebuffer state
    * aggregates these per-MRT bits (see si_set_framebuffer_state). */
   if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 ||
          format == V_028C70_COLOR_8_8_8_8)
         surf->color_is_int8 = true;
      else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10)
         surf->color_is_int10 = true;
   }

   /* ROUND_MODE is set for non-NORM/SRGB types except the 8_24/24_8 packed
    * depth-like color formats. */
   color_info =
      S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) |
      S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) |
      S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM &&
                          ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 &&
                          format != V_028C70_COLOR_24_8) |
      S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian);

   /* Intensity is implemented as Red, so treat it that way. */
   color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
                                             util_format_is_intensity(surf->base.format));

   if (tex->buffer.b.b.nr_samples > 1) {
      /* Samples and fragments are encoded as log2 in the registers. */
      unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
      unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);

      color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments);

      if (tex->surface.fmask_offset) {
         color_info |= S_028C70_COMPRESSION(1);
         unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);

         if (sctx->chip_class == GFX6) {
            /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
            color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
         }
      }
   }

   /* CB_DCC_CONTROL: DCC block-size tuning, per chip generation. */
   if (sctx->chip_class >= GFX10) {
      unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;

      /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
         64 for APU because all of our APUs to date use DIMMs which have
         a request granularity size of 64B while all other chips have a
         32B request size */
      if (!sctx->screen->info.has_dedicated_vram)
         min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;

      surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
                             S_028C78_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.dcc.max_compressed_block_size) |
                             S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
                             S_028C78_INDEPENDENT_64B_BLOCKS(tex->surface.u.gfx9.dcc.independent_64B_blocks) |
                             S_028C78_INDEPENDENT_128B_BLOCKS(tex->surface.u.gfx9.dcc.independent_128B_blocks);
   } else if (sctx->chip_class >= GFX8) {
      unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
      unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;

      /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
         64 for APU because all of our APUs to date use DIMMs which have
         a request granularity size of 64B while all other chips have a
         32B request size */
      if (!sctx->screen->info.has_dedicated_vram)
         min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;

      /* Smaller uncompressed blocks for small-bpe MSAA surfaces. */
      if (tex->buffer.b.b.nr_storage_samples > 1) {
         if (tex->surface.bpe == 1)
            max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
         else if (tex->surface.bpe == 2)
            max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
      }

      surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
                             S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
                             S_028C78_INDEPENDENT_64B_BLOCKS(1);
   }

   /* This must be set for fast clear to work without FMASK. */
   if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
      unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
      color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
   }

   /* GFX10 field has the same base shift as the GFX6 field */
   unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
                         S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
   unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);

   if (sctx->chip_class >= GFX10) {
      color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);

      surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
                               S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
                               S_028EE0_RESOURCE_LEVEL(1);
   } else if (sctx->chip_class == GFX9) {
      color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
      color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
                      S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
   }

   if (sctx->chip_class >= GFX9) {
      surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
                               S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
                               S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
   }

   surf->cb_color_view = color_view;
   surf->cb_color_info = color_info;
   surf->cb_color_attrib = color_attrib;

   /* Determine pixel shader export format */
   si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);

   surf->color_initialized = true;
}
2385 
/* Fill in the DB_* register values for a depth/stencil surface.
 *
 * Translates the render format to a DB format, builds DB_DEPTH_VIEW,
 * DB_Z_INFO / DB_STENCIL_INFO, the base addresses, and — when HTILE is
 * enabled for this level — DB_HTILE_DATA_BASE / DB_HTILE_SURFACE.
 * Takes the GFX9+ path or the GFX6-GFX8 legacy tiling path depending on
 * the chip class.  Results are cached on the surface;
 * surf->depth_initialized marks completion.
 */
static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf)
{
   struct si_texture *tex = (struct si_texture *)surf->base.texture;
   unsigned level = surf->base.u.tex.level;
   unsigned format, stencil_format;
   uint32_t z_info, s_info;

   format = si_translate_dbformat(tex->db_render_format);
   stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;

   assert(format != V_028040_Z_INVALID);
   if (format == V_028040_Z_INVALID)
      PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);

   surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
                         S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
   surf->db_htile_data_base = 0;
   surf->db_htile_surface = 0;

   /* GFX10 extends the slice range with high bits (layers >= 2048). */
   if (sctx->chip_class >= GFX10) {
      surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
                             S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
   }

   if (sctx->chip_class >= GFX9) {
      assert(tex->surface.u.gfx9.surf_offset == 0);
      /* Addresses are programmed in 256-byte units (>> 8). */
      surf->db_depth_base = tex->buffer.gpu_address >> 8;
      surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8;
      z_info = S_028038_FORMAT(format) |
               S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
               S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
               S_028038_MAXMIP(tex->buffer.b.b.last_level);
      s_info = S_02803C_FORMAT(stencil_format) |
               S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);

      if (sctx->chip_class == GFX9) {
         surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
         surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
      }
      surf->db_depth_view |= S_028008_MIPID(level);
      surf->db_depth_size =
         S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);

      if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
         z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1);

         if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
            /* Stencil buffer workaround ported from the GFX6-GFX8 code.
             * See that for explanation.
             */
            s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
         } else {
            /* Use all HTILE for depth if there's no stencil. */
            s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
         }

         surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
         surf->db_htile_surface =
            S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1);
         if (sctx->chip_class == GFX9) {
            surf->db_htile_surface |= S_028ABC_RB_ALIGNED(1);
         }
      }
   } else {
      /* GFX6-GFX8 */
      struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];

      assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);

      surf->db_depth_base =
         (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8;
      surf->db_stencil_base =
         (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8;

      z_info =
         S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
      s_info = S_028044_FORMAT(stencil_format);
      surf->db_depth_info = 0;

      if (sctx->chip_class >= GFX7) {
         /* GFX7+: decode tiling parameters from the tile mode arrays. */
         struct radeon_info *info = &sctx->screen->info;
         unsigned index = tex->surface.u.legacy.tiling_index[level];
         unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
         unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
         unsigned tile_mode = info->si_tile_mode_array[index];
         unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
         unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];

         surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
                                S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
                                S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
                                S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
                                S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
                                S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
         z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
         s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
      } else {
         /* GFX6: program tile mode indices directly. */
         unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
         z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
         tile_mode_index = si_tile_mode_index(tex, level, true);
         s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
      }

      /* Sizes are in 8x8 tile units; SLICE_TILE_MAX in 64-pixel tiles. */
      surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
                            S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
      surf->db_depth_slice =
         S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1);

      if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
         z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1);

         if (tex->surface.has_stencil) {
            /* Workaround: For a not yet understood reason, the
             * combination of MSAA, fast stencil clear and stencil
             * decompress messes with subsequent stencil buffer
             * uses. Problem was reproduced on Verde, Bonaire,
             * Tonga, and Carrizo.
             *
             * Disabling EXPCLEAR works around the problem.
             *
             * Check piglit's arb_texture_multisample-stencil-clear
             * test if you want to try changing this.
             */
            if (tex->buffer.b.b.nr_samples <= 1)
               s_info |= S_028044_ALLOW_EXPCLEAR(1);
         }

         surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
         surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
      }
   }

   surf->db_z_info = z_info;
   surf->db_stencil_info = s_info;

   surf->depth_initialized = true;
}
2523 
si_update_fb_dirtiness_after_rendering(struct si_context * sctx)2524 void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
2525 {
2526    if (sctx->decompression_enabled)
2527       return;
2528 
2529    if (sctx->framebuffer.state.zsbuf) {
2530       struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
2531       struct si_texture *tex = (struct si_texture *)surf->texture;
2532 
2533       tex->dirty_level_mask |= 1 << surf->u.tex.level;
2534 
2535       if (tex->surface.has_stencil)
2536          tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
2537    }
2538 
2539    unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
2540    while (compressed_cb_mask) {
2541       unsigned i = u_bit_scan(&compressed_cb_mask);
2542       struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
2543       struct si_texture *tex = (struct si_texture *)surf->texture;
2544 
2545       if (tex->surface.fmask_offset) {
2546          tex->dirty_level_mask |= 1 << surf->u.tex.level;
2547          tex->fmask_is_identity = false;
2548       }
2549       if (tex->dcc_gather_statistics)
2550          tex->separate_dcc_dirty = true;
2551    }
2552 }
2553 
si_dec_framebuffer_counters(const struct pipe_framebuffer_state * state)2554 static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
2555 {
2556    for (int i = 0; i < state->nr_cbufs; ++i) {
2557       struct si_surface *surf = NULL;
2558       struct si_texture *tex;
2559 
2560       if (!state->cbufs[i])
2561          continue;
2562       surf = (struct si_surface *)state->cbufs[i];
2563       tex = (struct si_texture *)surf->base.texture;
2564 
2565       p_atomic_dec(&tex->framebuffers_bound);
2566    }
2567 }
2568 
si_update_display_dcc_dirty(struct si_context * sctx)2569 static void si_update_display_dcc_dirty(struct si_context *sctx)
2570 {
2571    const struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
2572    struct si_surface *surf;
2573    struct si_texture *tex;
2574    int i;
2575 
2576    for (i = 0; i < state->nr_cbufs; i++) {
2577       if (!state->cbufs[i])
2578          continue;
2579 
2580       surf = (struct si_surface *)state->cbufs[i];
2581       tex = (struct si_texture *)surf->base.texture;
2582 
2583       if (!tex->surface.display_dcc_offset)
2584          continue;
2585 
2586       tex->displayable_dcc_dirty = true;
2587    }
2588 }
2589 
/* pipe_context::set_framebuffer_state implementation.
 *
 * Transitions from the old framebuffer to the new one: marks old
 * attachments dirty for decompression, stops/starts separate-DCC queries,
 * disables or decompresses DCC on incompatible surfaces, emits the needed
 * cache flushes, copies the new state, and recomputes all derived
 * framebuffer fields (SPI export formats, int8/int10 masks, compressed
 * masks, sample counts, metadata flags).  Finally dirties the dependent
 * state atoms and, on sample-count change, uploads new sample positions.
 *
 * NOTE(review): statement order here is load-bearing (unbind before DCC
 * decompression, flushes before the state copy) — do not reorder.
 */
static void si_set_framebuffer_state(struct pipe_context *ctx,
                                     const struct pipe_framebuffer_state *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_surface *surf = NULL;
   struct si_texture *tex;
   /* Snapshot the old state bits that decide which atoms must be re-dirtied. */
   bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
   unsigned old_nr_samples = sctx->framebuffer.nr_samples;
   unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
   bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
   bool old_has_stencil =
      old_has_zsbuf &&
      ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
   bool unbound = false;
   int i;

   /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
    * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
    * We could implement the full workaround here, but it's a useless case.
    */
   if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
      unreachable("the framebuffer shouldn't have zero area");
      return;
   }

   si_update_fb_dirtiness_after_rendering(sctx);

   /* Stop separate-DCC statistics queries on the outgoing color buffers. */
   for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
      if (!sctx->framebuffer.state.cbufs[i])
         continue;

      tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
      if (tex->dcc_gather_statistics)
         vi_separate_dcc_stop_query(sctx, tex);
   }

   /* Disable DCC if the formats are incompatible. */
   for (i = 0; i < state->nr_cbufs; i++) {
      if (!state->cbufs[i])
         continue;

      surf = (struct si_surface *)state->cbufs[i];
      tex = (struct si_texture *)surf->base.texture;

      if (!surf->dcc_incompatible)
         continue;

      /* Since the DCC decompression calls back into set_framebuffer-
       * _state, we need to unbind the framebuffer, so that
       * vi_separate_dcc_stop_query isn't called twice with the same
       * color buffer.
       */
      if (!unbound) {
         util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
         unbound = true;
      }

      if (vi_dcc_enabled(tex, surf->base.u.tex.level))
         if (!si_texture_disable_dcc(sctx, tex))
            si_decompress_dcc(sctx, tex);

      surf->dcc_incompatible = false;
   }

   /* Only flush TC when changing the framebuffer state, because
    * the only client not using TC that can change textures is
    * the framebuffer.
    *
    * Wait for compute shaders because of possible transitions:
    * - FB write -> shader read
    * - shader write -> FB read
    *
    * DB caches are flushed on demand (using si_decompress_textures).
    *
    * When MSAA is enabled, CB and TC caches are flushed on demand
    * (after FMASK decompression). Shader write -> FB read transitions
    * cannot happen for MSAA textures, because MSAA shader images are
    * not supported.
    *
    * Only flush and wait for CB if there is actually a bound color buffer.
    */
   if (sctx->framebuffer.uncompressed_cb_mask) {
      si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                 sctx->framebuffer.CB_has_shader_readable_metadata,
                                 sctx->framebuffer.all_DCC_pipe_aligned);
   }

   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

   /* u_blitter doesn't invoke depth decompression when it does multiple
    * blits in a row, but the only case when it matters for DB is when
    * doing generate_mipmap. So here we flush DB manually between
    * individual generate_mipmap blits.
    * Note that lower mipmap levels aren't compressed.
    */
   if (sctx->generate_mipmap_for_depth) {
      si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
   } else if (sctx->chip_class == GFX9) {
      /* It appears that DB metadata "leaks" in a sequence of:
       *  - depth clear
       *  - DCC decompress for shader image writes (with DB disabled)
       *  - render with DEPTH_BEFORE_SHADER=1
       * Flushing DB metadata works around the problem.
       */
      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
   }

   /* Take the maximum of the old and new count. If the new count is lower,
    * dirtying is needed to disable the unbound colorbuffers.
    */
   sctx->framebuffer.dirty_cbufs |=
      (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
   sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;

   si_dec_framebuffer_counters(&sctx->framebuffer.state);
   util_copy_framebuffer_state(&sctx->framebuffer.state, state);

   /* Reset all derived per-framebuffer fields before re-accumulating them
    * from the new attachments below. */
   sctx->framebuffer.colorbuf_enabled_4bit = 0;
   sctx->framebuffer.spi_shader_col_format = 0;
   sctx->framebuffer.spi_shader_col_format_alpha = 0;
   sctx->framebuffer.spi_shader_col_format_blend = 0;
   sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
   sctx->framebuffer.color_is_int8 = 0;
   sctx->framebuffer.color_is_int10 = 0;

   sctx->framebuffer.compressed_cb_mask = 0;
   sctx->framebuffer.uncompressed_cb_mask = 0;
   sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
   sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
   sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
   sctx->framebuffer.any_dst_linear = false;
   sctx->framebuffer.CB_has_shader_readable_metadata = false;
   sctx->framebuffer.DB_has_shader_readable_metadata = false;
   sctx->framebuffer.all_DCC_pipe_aligned = true;
   sctx->framebuffer.min_bytes_per_pixel = 0;

   for (i = 0; i < state->nr_cbufs; i++) {
      if (!state->cbufs[i])
         continue;

      surf = (struct si_surface *)state->cbufs[i];
      tex = (struct si_texture *)surf->base.texture;

      /* Lazily compute the CB register values the first time a surface
       * is bound. */
      if (!surf->color_initialized) {
         si_initialize_color_surface(sctx, surf);
      }

      /* Each MRT occupies 4 bits in the packed per-format masks. */
      sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
      sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4);
      sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4);
      sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4);
      sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha
                                                             << (i * 4);

      if (surf->color_is_int8)
         sctx->framebuffer.color_is_int8 |= 1 << i;
      if (surf->color_is_int10)
         sctx->framebuffer.color_is_int10 |= 1 << i;

      if (tex->surface.fmask_offset)
         sctx->framebuffer.compressed_cb_mask |= 1 << i;
      else
         sctx->framebuffer.uncompressed_cb_mask |= 1 << i;

      /* Don't update nr_color_samples for non-AA buffers.
       * (e.g. destination of MSAA resolve)
       */
      if (tex->buffer.b.b.nr_samples >= 2 &&
          tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
         sctx->framebuffer.nr_color_samples =
            MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples);
         sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples);
      }

      if (tex->surface.is_linear)
         sctx->framebuffer.any_dst_linear = true;

      if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
         sctx->framebuffer.CB_has_shader_readable_metadata = true;

         if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned)
            sctx->framebuffer.all_DCC_pipe_aligned = false;
      }

      si_context_add_resource_size(sctx, surf->base.texture);

      p_atomic_inc(&tex->framebuffers_bound);

      if (tex->dcc_gather_statistics) {
         /* Dirty tracking must be enabled for DCC usage analysis. */
         sctx->framebuffer.compressed_cb_mask |= 1 << i;
         vi_separate_dcc_start_query(sctx, tex);
      }

      /* Update the minimum but don't keep 0. */
      if (!sctx->framebuffer.min_bytes_per_pixel ||
          tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
         sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
   }

   /* For optimal DCC performance. */
   if (sctx->chip_class >= GFX10)
      sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
   else
      sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;

   struct si_texture *zstex = NULL;

   if (state->zsbuf) {
      surf = (struct si_surface *)state->zsbuf;
      zstex = (struct si_texture *)surf->base.texture;

      if (!surf->depth_initialized) {
         si_init_depth_surface(sctx, surf);
      }

      if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
         sctx->framebuffer.DB_has_shader_readable_metadata = true;

      si_context_add_resource_size(sctx, surf->base.texture);

      /* Update the minimum but don't keep 0. */
      if (!sctx->framebuffer.min_bytes_per_pixel ||
          zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
         sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
   }

   si_update_ps_colorbuf0_slot(sctx);
   si_update_poly_offset_state(sctx);
   si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
   si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);

   /* NGG cull state uses the sample count. */
   if (sctx->screen->use_ngg_culling)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);

   if (sctx->screen->dpbb_allowed)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);

   if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);

   /* Out-of-order rasterization depends on which buffers are bound. */
   if (sctx->screen->has_out_of_order_rast &&
       (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
        !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
        (zstex && zstex->surface.has_stencil != old_has_stencil)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);

   if (sctx->framebuffer.nr_samples != old_nr_samples) {
      struct pipe_constant_buffer constbuf = {0};

      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);

      constbuf.buffer = sctx->sample_pos_buffer;

      /* Set sample locations as fragment shader constants. */
      switch (sctx->framebuffer.nr_samples) {
      case 1:
         constbuf.buffer_offset = 0;
         break;
      case 2:
         /* The buffer offsets are computed from the layout of the
          * sample_positions struct in memory. */
         constbuf.buffer_offset =
            (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1;
         break;
      case 4:
         constbuf.buffer_offset =
            (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1;
         break;
      case 8:
         constbuf.buffer_offset =
            (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1;
         break;
      case 16:
         constbuf.buffer_offset =
            (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1;
         break;
      default:
         PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples);
         assert(0);
      }
      constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
      si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);

      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
   }

   sctx->do_update_shaders = true;

   if (!sctx->decompression_enabled) {
      /* Prevent textures decompression when the framebuffer state
       * changes come from the decompression passes themselves.
       */
      sctx->need_check_render_feedback = true;
   }
}
2886 
/* Emit the bound framebuffer (color and depth/stencil buffers) into the GFX
 * command stream.
 *
 * Only surfaces flagged in sctx->framebuffer.dirty_cbufs / dirty_zsbuf are
 * re-emitted; both dirty masks are cleared at the end. The register layout
 * differs per chip generation (GFX6-8, GFX9, GFX10+), so each colorbuffer and
 * the ZS buffer take one of three emission paths.
 */
static void si_emit_framebuffer_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
   unsigned i, nr_cbufs = state->nr_cbufs;
   struct si_texture *tex = NULL;
   struct si_surface *cb = NULL;
   unsigned cb_color_info = 0;

   /* Colorbuffers. */
   for (i = 0; i < nr_cbufs; i++) {
      uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
      unsigned cb_color_attrib;

      /* Skip colorbuffers that haven't changed since the last emit. */
      if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
         continue;

      cb = (struct si_surface *)state->cbufs[i];
      if (!cb) {
         /* Unbound slot: mark the color format invalid so the CB ignores it. */
         radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
         continue;
      }

      tex = (struct si_texture *)cb->base.texture;
      radeon_add_to_buffer_list(
         sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
         tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER);

      /* CMASK may live in its own buffer; make the kernel aware of it too. */
      if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE,
                                   RADEON_PRIO_SEPARATE_META);
      }

      if (tex->dcc_separate_buffer)
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer,
                                   RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);

      /* Compute mutable surface parameters. */
      cb_color_base = tex->buffer.gpu_address >> 8;
      cb_color_fmask = 0;
      cb_color_cmask = tex->cmask_base_address_reg;
      cb_dcc_base = 0;
      cb_color_info = cb->cb_color_info | tex->cb_color_info;
      cb_color_attrib = cb->cb_color_attrib;

      /* Fast clear is only tracked for level 0. */
      if (cb->base.u.tex.level > 0)
         cb_color_info &= C_028C70_FAST_CLEAR;

      if (tex->surface.fmask_offset) {
         cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
         cb_color_fmask |= tex->surface.fmask_tile_swizzle;
      }

      /* Set up DCC. */
      if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
         /* Detect the MSAA-resolve-destination case: cbuf0 is multisampled and
          * cbuf1 (this surface) is the single-sampled resolve target. DCC must
          * stay disabled on the destination in that configuration.
          */
         bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 &&
                                    state->cbufs[1] == &cb->base &&
                                    state->cbufs[1]->texture->nr_samples <= 1;

         if (!is_msaa_resolve_dst)
            cb_color_info |= S_028C70_DCC_ENABLE(1);

         /* With a separate DCC buffer the base address is relative to it,
          * so only add the texture GPU address for inlined DCC. */
         cb_dcc_base =
            ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >>
            8;

         unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
         dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
         cb_dcc_base |= dcc_tile_swizzle;
      }

      if (sctx->chip_class >= GFX10) {
         /* GFX10+ path: 32-bit low registers plus separate *_EXT high-address
          * registers and the new ATTRIB3 register. */
         unsigned cb_color_attrib3;

         /* Set mutable surface parameters. */
         cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
         cb_color_base |= tex->surface.tile_swizzle;
         if (!tex->surface.fmask_offset)
            cb_color_fmask = cb_color_base;
         if (cb->base.u.tex.level > 0)
            cb_color_cmask = cb_color_base;

         cb_color_attrib3 = cb->cb_color_attrib3 |
                            S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
                            S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
                            S_028EE0_CMASK_PIPE_ALIGNED(1) |
                            S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);

         /* 14 consecutive dwords; the "hole" slots are registers unused on GFX10. */
         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
         radeon_emit(cs, cb_color_base);             /* CB_COLOR0_BASE */
         radeon_emit(cs, 0);                         /* hole */
         radeon_emit(cs, 0);                         /* hole */
         radeon_emit(cs, cb->cb_color_view);         /* CB_COLOR0_VIEW */
         radeon_emit(cs, cb_color_info);             /* CB_COLOR0_INFO */
         radeon_emit(cs, cb_color_attrib);           /* CB_COLOR0_ATTRIB */
         radeon_emit(cs, cb->cb_dcc_control);        /* CB_COLOR0_DCC_CONTROL */
         radeon_emit(cs, cb_color_cmask);            /* CB_COLOR0_CMASK */
         radeon_emit(cs, 0);                         /* hole */
         radeon_emit(cs, cb_color_fmask);            /* CB_COLOR0_FMASK */
         radeon_emit(cs, 0);                         /* hole */
         radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
         radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
         radeon_emit(cs, cb_dcc_base);               /* CB_COLOR0_DCC_BASE */

         radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
         radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
                                cb_color_cmask >> 32);
         radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
                                cb_color_fmask >> 32);
         radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
         radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
         radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
      } else if (sctx->chip_class == GFX9) {
         /* GFX9 path: 48-bit addresses split into low dword + BASE_256B high bits. */
         struct gfx9_surf_meta_flags meta = {
            .rb_aligned = 1,
            .pipe_aligned = 1,
         };

         if (tex->surface.dcc_offset)
            meta = tex->surface.u.gfx9.dcc;

         /* Set mutable surface parameters. */
         cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
         cb_color_base |= tex->surface.tile_swizzle;
         if (!tex->surface.fmask_offset)
            cb_color_fmask = cb_color_base;
         if (cb->base.u.tex.level > 0)
            cb_color_cmask = cb_color_base;
         cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
                            S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
                            S_028C74_RB_ALIGNED(meta.rb_aligned) |
                            S_028C74_PIPE_ALIGNED(meta.pipe_aligned);

         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
         radeon_emit(cs, cb_color_base);                            /* CB_COLOR0_BASE */
         radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32));  /* CB_COLOR0_BASE_EXT */
         radeon_emit(cs, cb->cb_color_attrib2);                     /* CB_COLOR0_ATTRIB2 */
         radeon_emit(cs, cb->cb_color_view);                        /* CB_COLOR0_VIEW */
         radeon_emit(cs, cb_color_info);                            /* CB_COLOR0_INFO */
         radeon_emit(cs, cb_color_attrib);                          /* CB_COLOR0_ATTRIB */
         radeon_emit(cs, cb->cb_dcc_control);                       /* CB_COLOR0_DCC_CONTROL */
         radeon_emit(cs, cb_color_cmask);                           /* CB_COLOR0_CMASK */
         radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
         radeon_emit(cs, cb_color_fmask);                           /* CB_COLOR0_FMASK */
         radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
         radeon_emit(cs, tex->color_clear_value[0]);                /* CB_COLOR0_CLEAR_WORD0 */
         radeon_emit(cs, tex->color_clear_value[1]);                /* CB_COLOR0_CLEAR_WORD1 */
         radeon_emit(cs, cb_dcc_base);                              /* CB_COLOR0_DCC_BASE */
         radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32));    /* CB_COLOR0_DCC_BASE_EXT */

         radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
                                S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
      } else {
         /* Compute mutable surface parameters (GFX6-GFX8). */
         const struct legacy_surf_level *level_info =
            &tex->surface.u.legacy.level[cb->base.u.tex.level];
         unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
         unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;

         cb_color_base += level_info->offset >> 8;
         /* Only macrotiled modes can set tile swizzle. */
         if (level_info->mode == RADEON_SURF_MODE_2D)
            cb_color_base |= tex->surface.tile_swizzle;

         if (!tex->surface.fmask_offset)
            cb_color_fmask = cb_color_base;
         if (cb->base.u.tex.level > 0)
            cb_color_cmask = cb_color_base;
         if (cb_dcc_base)
            cb_dcc_base += level_info->dcc_offset >> 8;

         /* Pitch/slice are expressed in 8x8 tiles, minus one (hardware encoding). */
         pitch_tile_max = level_info->nblk_x / 8 - 1;
         slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1;
         tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);

         cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
         cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
         cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);

         if (tex->surface.fmask_offset) {
            if (sctx->chip_class >= GFX7)
               cb_color_pitch |=
                  S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
            cb_color_attrib |=
               S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
            cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
         } else {
            /* This must be set for fast clear to work without FMASK. */
            if (sctx->chip_class >= GFX7)
               cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
            cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
            cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
         }

         /* GFX8 adds the DCC_BASE register at the end of the sequence. */
         radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
                                    sctx->chip_class >= GFX8 ? 14 : 13);
         radeon_emit(cs, cb_color_base);                              /* CB_COLOR0_BASE */
         radeon_emit(cs, cb_color_pitch);                             /* CB_COLOR0_PITCH */
         radeon_emit(cs, cb_color_slice);                             /* CB_COLOR0_SLICE */
         radeon_emit(cs, cb->cb_color_view);                          /* CB_COLOR0_VIEW */
         radeon_emit(cs, cb_color_info);                              /* CB_COLOR0_INFO */
         radeon_emit(cs, cb_color_attrib);                            /* CB_COLOR0_ATTRIB */
         radeon_emit(cs, cb->cb_dcc_control);                         /* CB_COLOR0_DCC_CONTROL */
         radeon_emit(cs, cb_color_cmask);                             /* CB_COLOR0_CMASK */
         radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
         radeon_emit(cs, cb_color_fmask);                             /* CB_COLOR0_FMASK */
         radeon_emit(cs, cb_color_fmask_slice);                       /* CB_COLOR0_FMASK_SLICE */
         radeon_emit(cs, tex->color_clear_value[0]);                  /* CB_COLOR0_CLEAR_WORD0 */
         radeon_emit(cs, tex->color_clear_value[1]);                  /* CB_COLOR0_CLEAR_WORD1 */

         if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
            radeon_emit(cs, cb_dcc_base);
      }
   }
   /* Invalidate the remaining (unbound but dirty) colorbuffer slots. */
   for (; i < 8; i++)
      if (sctx->framebuffer.dirty_cbufs & (1 << i))
         radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);

   /* ZS buffer. */
   if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
      struct si_surface *zb = (struct si_surface *)state->zsbuf;
      struct si_texture *tex = (struct si_texture *)zb->base.texture;
      unsigned db_z_info = zb->db_z_info;
      unsigned db_stencil_info = zb->db_stencil_info;
      unsigned db_htile_surface = zb->db_htile_surface;

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
                                zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA
                                                                 : RADEON_PRIO_DEPTH_BUFFER);

      /* Set fields dependent on tc_compatile_htile. */
      if (sctx->chip_class >= GFX9 &&
          vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS)) {
         unsigned max_zplanes = 4;

         /* Z16 MSAA stores fewer Z planes per HTILE entry. */
         if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
            max_zplanes = 2;

         db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);

         if (sctx->chip_class >= GFX10) {
            db_z_info |= S_028040_ITERATE_FLUSH(1);
            db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
         } else {
            db_z_info |= S_028038_ITERATE_FLUSH(1);
            db_stencil_info |= S_02803C_ITERATE_FLUSH(1);
         }
      }

      if (sctx->chip_class >= GFX10) {
         radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
         radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);

         radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
         radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
         radeon_emit(cs, db_z_info |                  /* DB_Z_INFO */
                            S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
         radeon_emit(cs, db_stencil_info);     /* DB_STENCIL_INFO */
         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */

         radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
         radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_READ_BASE_HI */
         radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_READ_BASE_HI */
         radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_WRITE_BASE_HI */
         radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_WRITE_BASE_HI */
         radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
      } else if (sctx->chip_class == GFX9) {
         radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
         radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
         radeon_emit(cs,
                     S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
         radeon_emit(cs, zb->db_depth_size);                          /* DB_DEPTH_SIZE */

         radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
         radeon_emit(cs, db_z_info |                                   /* DB_Z_INFO */
                            S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
         radeon_emit(cs, db_stencil_info);                             /* DB_STENCIL_INFO */
         radeon_emit(cs, zb->db_depth_base);                           /* DB_Z_READ_BASE */
         radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32));   /* DB_Z_READ_BASE_HI */
         radeon_emit(cs, zb->db_stencil_base);                         /* DB_STENCIL_READ_BASE */
         radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
         radeon_emit(cs, zb->db_depth_base);                           /* DB_Z_WRITE_BASE */
         radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32));   /* DB_Z_WRITE_BASE_HI */
         radeon_emit(cs, zb->db_stencil_base);                         /* DB_STENCIL_WRITE_BASE */
         radeon_emit(cs,
                     S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */

         radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
         radeon_emit(cs, zb->db_z_info2);       /* DB_Z_INFO2 */
         radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
      } else {
         /* GFX6-GFX8 */
         /* Set fields dependent on tc_compatile_htile. */
         if (si_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS)) {
            if (!tex->surface.has_stencil && !tex->tc_compatible_htile) {
               /* Use all of the htile_buffer for depth if there's no stencil.
                * This must not be set when TC-compatible HTILE is enabled
                * due to a hw bug.
                */
               db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
            }

            if (tex->tc_compatible_htile) {
               db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);

               /* 0 = full compression. N = only compress up to N-1 Z planes. */
               if (tex->buffer.b.b.nr_samples <= 1)
                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
               else if (tex->buffer.b.b.nr_samples <= 4)
                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
               else
                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
            }
         }

         radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);

         radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
         radeon_emit(cs, zb->db_depth_info |   /* DB_DEPTH_INFO */
                     S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile));
         radeon_emit(cs, db_z_info |           /* DB_Z_INFO */
                            S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
         radeon_emit(cs, db_stencil_info);     /* DB_STENCIL_INFO */
         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
         radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
         radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
         radeon_emit(cs, zb->db_depth_size);   /* DB_DEPTH_SIZE */
         radeon_emit(cs, zb->db_depth_slice);  /* DB_DEPTH_SLICE */
      }

      /* Clear values and view/HTILE registers shared by all generations. */
      radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
      radeon_emit(cs, tex->stencil_clear_value);    /* R_028028_DB_STENCIL_CLEAR */
      radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */

      radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
      radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
   } else if (sctx->framebuffer.dirty_zsbuf) {
      /* No ZS buffer bound: invalidate depth and stencil formats.
       * NOTE(review): DB_Z_INFO lives at a different offset on GFX9 vs GFX6-8/10;
       * presumably the two register addresses below cover those layouts — confirm
       * against sid.h for GFX10+. */
      if (sctx->chip_class == GFX9)
         radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
      else
         radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);

      radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID));       /* DB_Z_INFO */
      radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
   }

   /* Framebuffer dimensions. */
   /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */
   radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
                          S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));

   if (sctx->screen->dfsm_allowed) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
   }

   si_update_display_dcc_dirty(sctx);

   /* All dirty surfaces have been re-emitted. */
   sctx->framebuffer.dirty_cbufs = 0;
   sctx->framebuffer.dirty_zsbuf = false;
}
3253 
/* Emit MSAA sample locations and the small-primitive filter state.
 *
 * Sample locations are re-emitted only when the effective sample count
 * changes (tracked in sctx->sample_locs_num_samples). Also programs
 * PA_SU_SMALL_PRIM_FILTER_CNTL on Polaris+ and the pixel-boundary
 * exclusion bits on GFX7+.
 */
static void si_emit_msaa_sample_locs(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   unsigned nr_samples = sctx->framebuffer.nr_samples;
   bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;

   /* Smoothing (only possible with nr_samples == 1) uses the same
    * sample locations as the MSAA it simulates.
    */
   if (nr_samples <= 1 && sctx->smoothing_enabled)
      nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;

   /* On Polaris, the small primitive filter uses the sample locations
    * even when MSAA is off, so we need to make sure they're set to 0.
    *
    * GFX10 uses sample locations unconditionally, so they always need
    * to be set up.
    */
   if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) &&
       nr_samples != sctx->sample_locs_num_samples) {
      sctx->sample_locs_num_samples = nr_samples;
      si_emit_sample_locations(cs, nr_samples);
   }

   if (sctx->family >= CHIP_POLARIS10) {
      unsigned small_prim_filter_cntl =
         S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
         /* line bug */
         S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);

      /* The alternative of setting sample locations to 0 would
       * require a DB flush to avoid Z errors, see
       * https://bugs.freedesktop.org/show_bug.cgi?id=96908
       */
      if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable)
         small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;

      radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
                                 SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl);
   }

   /* The exclusion bits can be set to improve rasterization efficiency
    * if no sample lies on the pixel boundary (-8 sample offset).
    */
   bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16);
   radeon_opt_set_context_reg(
      sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
      S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
}
3304 
si_out_of_order_rasterization(struct si_context * sctx)3305 static bool si_out_of_order_rasterization(struct si_context *sctx)
3306 {
3307    struct si_state_blend *blend = sctx->queued.named.blend;
3308    struct si_state_dsa *dsa = sctx->queued.named.dsa;
3309 
3310    if (!sctx->screen->has_out_of_order_rast)
3311       return false;
3312 
3313    unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
3314 
3315    colormask &= blend->cb_target_enabled_4bit;
3316 
3317    /* Conservative: No logic op. */
3318    if (colormask && blend->logicop_enable)
3319       return false;
3320 
3321    struct si_dsa_order_invariance dsa_order_invariant = {.zs = true,
3322                                                          .pass_set = true,
3323                                                          .pass_last = false};
3324 
3325    if (sctx->framebuffer.state.zsbuf) {
3326       struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
3327       bool has_stencil = zstex->surface.has_stencil;
3328       dsa_order_invariant = dsa->order_invariance[has_stencil];
3329       if (!dsa_order_invariant.zs)
3330          return false;
3331 
3332       /* The set of PS invocations is always order invariant,
3333        * except when early Z/S tests are requested. */
3334       if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.base.writes_memory &&
3335           sctx->ps_shader.cso->info.base.fs.early_fragment_tests &&
3336           !dsa_order_invariant.pass_set)
3337          return false;
3338 
3339       if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set)
3340          return false;
3341    }
3342 
3343    if (!colormask)
3344       return true;
3345 
3346    unsigned blendmask = colormask & blend->blend_enable_4bit;
3347 
3348    if (blendmask) {
3349       /* Only commutative blending. */
3350       if (blendmask & ~blend->commutative_4bit)
3351          return false;
3352 
3353       if (!dsa_order_invariant.pass_set)
3354          return false;
3355    }
3356 
3357    if (colormask & ~blendmask) {
3358       if (!dsa_order_invariant.pass_last)
3359          return false;
3360    }
3361 
3362    return true;
3363 }
3364 
si_emit_msaa_config(struct si_context * sctx)3365 static void si_emit_msaa_config(struct si_context *sctx)
3366 {
3367    struct radeon_cmdbuf *cs = sctx->gfx_cs;
3368    unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
3369    /* 33% faster rendering to linear color buffers */
3370    bool dst_is_linear = sctx->framebuffer.any_dst_linear;
3371    bool out_of_order_rast = si_out_of_order_rasterization(sctx);
3372    unsigned sc_mode_cntl_1 =
3373       S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
3374       S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
3375       S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
3376       S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
3377       /* always 1: */
3378       S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
3379       S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
3380       S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
3381    unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
3382                       S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
3383    unsigned coverage_samples, color_samples, z_samples;
3384    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
3385 
3386    /* S: Coverage samples (up to 16x):
3387     * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
3388     * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
3389     *
3390     * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
3391     * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
3392     * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
3393     * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
3394     * # from the closest defined sample if Z is uncompressed (same quality as the number of
3395     * # Z samples).
3396     *
3397     * F: Color samples (up to 8x, must be <= coverage samples):
3398     * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
3399     * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
3400     *
3401     * Can be anything between coverage and color samples:
3402     * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
3403     * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
3404     * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
3405     * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
3406     * # All are currently set the same as coverage samples.
3407     *
3408     * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
3409     * flag for undefined color samples. A shader-based resolve must handle unknowns
3410     * or mask them out with AND. Unknowns can also be guessed from neighbors via
3411     * an edge-detect shader-based resolve, which is required to make "color samples = 1"
3412     * useful. The CB resolve always drops unknowns.
3413     *
3414     * Sensible AA configurations:
3415     *   EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
3416     *   EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
3417     *   EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
3418     *   EQAA  8s 8z 8f = 8x MSAA
3419     *   EQAA  8s 8z 4f - might look the same as 8x MSAA
3420     *   EQAA  8s 8z 2f - might look the same as 8x MSAA with low-density geometry
3421     *   EQAA  8s 4z 4f - might look the same as 8x MSAA if Z is compressed
3422     *   EQAA  8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
3423     *   EQAA  4s 4z 4f = 4x MSAA
3424     *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
3425     *   EQAA  2s 2z 2f = 2x MSAA
3426     */
3427    coverage_samples = color_samples = z_samples = si_get_num_coverage_samples(sctx);
3428 
3429    if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
3430       color_samples = sctx->framebuffer.nr_color_samples;
3431 
3432       if (sctx->framebuffer.state.zsbuf) {
3433          z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
3434          z_samples = MAX2(1, z_samples);
3435       } else {
3436          z_samples = coverage_samples;
3437       }
3438    }
3439 
3440    /* Required by OpenGL line rasterization.
3441     *
3442     * TODO: We should also enable perpendicular endcaps for AA lines,
3443     *       but that requires implementing line stippling in the pixel
3444     *       shader. SC can only do line stippling with axis-aligned
3445     *       endcaps.
3446     */
3447    unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
3448    unsigned sc_aa_config = 0;
3449 
3450    if (coverage_samples > 1) {
3451       /* distance from the pixel center, indexed by log2(nr_samples) */
3452       static unsigned max_dist[] = {
3453          0, /* unused */
3454          4, /* 2x MSAA */
3455          6, /* 4x MSAA */
3456          7, /* 8x MSAA */
3457          8, /* 16x MSAA */
3458       };
3459       unsigned log_samples = util_logbase2(coverage_samples);
3460       unsigned log_z_samples = util_logbase2(z_samples);
3461       unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
3462       unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
3463 
3464       sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
3465       sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
3466                      S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
3467                      S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) |
3468                      S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->chip_class >= GFX10_3);
3469 
3470       if (sctx->framebuffer.nr_samples > 1) {
3471          db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
3472                     S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
3473                     S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
3474                     S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
3475          sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
3476       } else if (sctx->smoothing_enabled) {
3477          db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
3478       }
3479    }
3480 
3481    unsigned initial_cdw = cs->current.cdw;
3482 
3483    /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
3484    radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL,
3485                                sc_line_cntl, sc_aa_config);
3486    /* R_028804_DB_EQAA */
3487    radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa);
3488    /* R_028A4C_PA_SC_MODE_CNTL_1 */
3489    radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
3490                               sc_mode_cntl_1);
3491 
3492    if (initial_cdw != cs->current.cdw) {
3493       sctx->context_roll = true;
3494 
3495       /* GFX9: Flush DFSM when the AA mode changes. */
3496       if (sctx->screen->dfsm_allowed) {
3497          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3498          radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
3499       }
3500    }
3501 }
3502 
si_update_ps_iter_samples(struct si_context * sctx)3503 void si_update_ps_iter_samples(struct si_context *sctx)
3504 {
3505    if (sctx->framebuffer.nr_samples > 1)
3506       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
3507    if (sctx->screen->dpbb_allowed)
3508       si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
3509 }
3510 
si_set_min_samples(struct pipe_context * ctx,unsigned min_samples)3511 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
3512 {
3513    struct si_context *sctx = (struct si_context *)ctx;
3514 
3515    /* The hardware can only do sample shading with 2^n samples. */
3516    min_samples = util_next_power_of_two(min_samples);
3517 
3518    if (sctx->ps_iter_samples == min_samples)
3519       return;
3520 
3521    sctx->ps_iter_samples = min_samples;
3522    sctx->do_update_shaders = true;
3523 
3524    si_update_ps_iter_samples(sctx);
3525 }
3526 
3527 /*
3528  * Samplers
3529  */
3530 
3531 /**
3532  * Build the sampler view descriptor for a buffer texture.
3533  * @param state 256-bit descriptor; only the high 128 bits are filled in
3534  */
si_make_buffer_descriptor(struct si_screen * screen,struct si_resource * buf,enum pipe_format format,unsigned offset,unsigned size,uint32_t * state)3535 void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
3536                                enum pipe_format format, unsigned offset, unsigned size,
3537                                uint32_t *state)
3538 {
3539    const struct util_format_description *desc;
3540    unsigned stride;
3541    unsigned num_records;
3542 
3543    desc = util_format_description(format);
3544    stride = desc->block.bits / 8;
3545 
3546    num_records = size / stride;
3547    num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
3548 
3549    /* The NUM_RECORDS field has a different meaning depending on the chip,
3550     * instruction type, STRIDE, and SWIZZLE_ENABLE.
3551     *
3552     * GFX6-7,10:
3553     * - If STRIDE == 0, it's in byte units.
3554     * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
3555     *
3556     * GFX8:
3557     * - For SMEM and STRIDE == 0, it's in byte units.
3558     * - For SMEM and STRIDE != 0, it's in units of STRIDE.
3559     * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
3560     * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
3561     * NOTE: There is incompatibility between VMEM and SMEM opcodes due to SWIZZLE_-
3562     *       ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0 when
3563     *       using SMEM. This can be done in the shader by clearing STRIDE with s_and.
3564     *       That way the same descriptor can be used by both SMEM and VMEM.
3565     *
3566     * GFX9:
3567     * - For SMEM and STRIDE == 0, it's in byte units.
3568     * - For SMEM and STRIDE != 0, it's in units of STRIDE.
3569     * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
3570     * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
3571     */
3572    if (screen->info.chip_class == GFX8)
3573       num_records *= stride;
3574 
3575    state[4] = 0;
3576    state[5] = S_008F04_STRIDE(stride);
3577    state[6] = num_records;
3578    state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
3579               S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
3580               S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
3581               S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
3582 
3583    if (screen->info.chip_class >= GFX10) {
3584       const struct gfx10_format *fmt = &gfx10_format_table[format];
3585 
3586       /* OOB_SELECT chooses the out-of-bounds check:
3587        *  - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
3588        *  - 1: index >= NUM_RECORDS
3589        *  - 2: NUM_RECORDS == 0
3590        *  - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
3591        *       else: swizzle_address >= NUM_RECORDS
3592        */
3593       state[7] |= S_008F0C_FORMAT(fmt->img_format) |
3594                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
3595                   S_008F0C_RESOURCE_LEVEL(1);
3596    } else {
3597       int first_non_void;
3598       unsigned num_format, data_format;
3599 
3600       first_non_void = util_format_get_first_non_void_channel(format);
3601       num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
3602       data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
3603 
3604       state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
3605    }
3606 }
3607 
gfx9_border_color_swizzle(const unsigned char swizzle[4])3608 static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4])
3609 {
3610    unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
3611 
3612    if (swizzle[3] == PIPE_SWIZZLE_X) {
3613       /* For the pre-defined border color values (white, opaque
3614        * black, transparent black), the only thing that matters is
3615        * that the alpha channel winds up in the correct place
3616        * (because the RGB channels are all the same) so either of
3617        * these enumerations will work.
3618        */
3619       if (swizzle[2] == PIPE_SWIZZLE_Y)
3620          bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
3621       else
3622          bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
3623    } else if (swizzle[0] == PIPE_SWIZZLE_X) {
3624       if (swizzle[1] == PIPE_SWIZZLE_Y)
3625          bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
3626       else
3627          bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
3628    } else if (swizzle[1] == PIPE_SWIZZLE_X) {
3629       bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
3630    } else if (swizzle[2] == PIPE_SWIZZLE_X) {
3631       bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
3632    }
3633 
3634    return bc_swizzle;
3635 }
3636 
3637 /**
3638  * Build the sampler view descriptor for a texture.
3639  */
gfx10_make_texture_descriptor(struct si_screen * screen,struct si_texture * tex,bool sampler,enum pipe_texture_target target,enum pipe_format pipe_format,const unsigned char state_swizzle[4],unsigned first_level,unsigned last_level,unsigned first_layer,unsigned last_layer,unsigned width,unsigned height,unsigned depth,uint32_t * state,uint32_t * fmask_state)3640 static void gfx10_make_texture_descriptor(
3641    struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target,
3642    enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level,
3643    unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height,
3644    unsigned depth, uint32_t *state, uint32_t *fmask_state)
3645 {
3646    struct pipe_resource *res = &tex->buffer.b.b;
3647    const struct util_format_description *desc;
3648    unsigned img_format;
3649    unsigned char swizzle[4];
3650    unsigned type;
3651    uint64_t va;
3652 
3653    desc = util_format_description(pipe_format);
3654    img_format = gfx10_format_table[pipe_format].img_format;
3655 
3656    if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
3657       const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
3658       const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
3659       const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
3660       bool is_stencil = false;
3661 
3662       switch (pipe_format) {
3663       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3664       case PIPE_FORMAT_X32_S8X24_UINT:
3665       case PIPE_FORMAT_X8Z24_UNORM:
3666          util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3667          is_stencil = true;
3668          break;
3669       case PIPE_FORMAT_X24S8_UINT:
3670          /*
3671           * X24S8 is implemented as an 8_8_8_8 data format, to
3672           * fix texture gathers. This affects at least
3673           * GL45-CTS.texture_cube_map_array.sampling on GFX8.
3674           */
3675          util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
3676          is_stencil = true;
3677          break;
3678       default:
3679          util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
3680          is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
3681       }
3682 
3683       if (tex->upgraded_depth && !is_stencil) {
3684          assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
3685          img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
3686       }
3687    } else {
3688       util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
3689    }
3690 
3691    if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
3692       /* For the purpose of shader images, treat cube maps as 2D
3693        * arrays.
3694        */
3695       type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
3696    } else {
3697       type = si_tex_dim(screen, tex, target, res->nr_samples);
3698    }
3699 
3700    if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
3701       height = 1;
3702       depth = res->array_size;
3703    } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
3704       if (sampler || res->target != PIPE_TEXTURE_3D)
3705          depth = res->array_size;
3706    } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
3707       depth = res->array_size / 6;
3708 
3709    state[0] = 0;
3710    state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1);
3711    state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
3712               S_00A008_RESOURCE_LEVEL(1);
3713    state[3] =
3714       S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
3715       S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
3716       S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
3717       S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
3718       S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) |
3719       S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) |
3720       S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type);
3721    /* Depth is the the last accessible layer on gfx9+. The hw doesn't need
3722     * to know the total number of layers.
3723     */
3724    state[4] =
3725       S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) |
3726       S_00A010_BASE_ARRAY(first_layer);
3727    state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
3728               S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
3729                                                    : tex->buffer.b.b.last_level) |
3730               S_00A014_PERF_MOD(4);
3731    state[6] = 0;
3732    state[7] = 0;
3733 
3734    if (vi_dcc_enabled(tex, first_level)) {
3735       state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
3736                   S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.dcc.max_compressed_block_size) |
3737                   S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
3738    }
3739 
3740    /* Initialize the sampler view for FMASK. */
3741    if (tex->surface.fmask_offset) {
3742       uint32_t format;
3743 
3744       va = tex->buffer.gpu_address + tex->surface.fmask_offset;
3745 
3746 #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
3747       switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
3748       case FMASK(2, 1):
3749          format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
3750          break;
3751       case FMASK(2, 2):
3752          format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
3753          break;
3754       case FMASK(4, 1):
3755          format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
3756          break;
3757       case FMASK(4, 2):
3758          format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
3759          break;
3760       case FMASK(4, 4):
3761          format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
3762          break;
3763       case FMASK(8, 1):
3764          format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
3765          break;
3766       case FMASK(8, 2):
3767          format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
3768          break;
3769       case FMASK(8, 4):
3770          format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
3771          break;
3772       case FMASK(8, 8):
3773          format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
3774          break;
3775       case FMASK(16, 1):
3776          format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
3777          break;
3778       case FMASK(16, 2):
3779          format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
3780          break;
3781       case FMASK(16, 4):
3782          format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
3783          break;
3784       case FMASK(16, 8):
3785          format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
3786          break;
3787       default:
3788          unreachable("invalid nr_samples");
3789       }
3790 #undef FMASK
3791       fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
3792       fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) |
3793                        S_00A004_WIDTH_LO(width - 1);
3794       fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
3795                        S_00A008_RESOURCE_LEVEL(1);
3796       fmask_state[3] =
3797          S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
3798          S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
3799          S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
3800          S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
3801       fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer);
3802       fmask_state[5] = 0;
3803       fmask_state[6] = S_00A018_META_PIPE_ALIGNED(1);
3804       fmask_state[7] = 0;
3805    }
3806 }
3807 
3808 /**
3809  * Build the sampler view descriptor for a texture (SI-GFX9).
3810  */
si_make_texture_descriptor(struct si_screen * screen,struct si_texture * tex,bool sampler,enum pipe_texture_target target,enum pipe_format pipe_format,const unsigned char state_swizzle[4],unsigned first_level,unsigned last_level,unsigned first_layer,unsigned last_layer,unsigned width,unsigned height,unsigned depth,uint32_t * state,uint32_t * fmask_state)3811 static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex,
3812                                        bool sampler, enum pipe_texture_target target,
3813                                        enum pipe_format pipe_format,
3814                                        const unsigned char state_swizzle[4], unsigned first_level,
3815                                        unsigned last_level, unsigned first_layer,
3816                                        unsigned last_layer, unsigned width, unsigned height,
3817                                        unsigned depth, uint32_t *state, uint32_t *fmask_state)
3818 {
3819    struct pipe_resource *res = &tex->buffer.b.b;
3820    const struct util_format_description *desc;
3821    unsigned char swizzle[4];
3822    int first_non_void;
3823    unsigned num_format, data_format, type, num_samples;
3824    uint64_t va;
3825 
3826    desc = util_format_description(pipe_format);
3827 
3828    num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples)
3829                                                                : MAX2(1, res->nr_storage_samples);
3830 
3831    if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
3832       const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
3833       const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
3834       const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
3835 
3836       switch (pipe_format) {
3837       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3838       case PIPE_FORMAT_X32_S8X24_UINT:
3839       case PIPE_FORMAT_X8Z24_UNORM:
3840          util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3841          break;
3842       case PIPE_FORMAT_X24S8_UINT:
3843          /*
3844           * X24S8 is implemented as an 8_8_8_8 data format, to
3845           * fix texture gathers. This affects at least
3846           * GL45-CTS.texture_cube_map_array.sampling on GFX8.
3847           */
3848          if (screen->info.chip_class <= GFX8)
3849             util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
3850          else
3851             util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3852          break;
3853       default:
3854          util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
3855       }
3856    } else {
3857       util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
3858    }
3859 
3860    first_non_void = util_format_get_first_non_void_channel(pipe_format);
3861 
3862    switch (pipe_format) {
3863    case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3864       num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3865       break;
3866    default:
3867       if (first_non_void < 0) {
3868          if (util_format_is_compressed(pipe_format)) {
3869             switch (pipe_format) {
3870             case PIPE_FORMAT_DXT1_SRGB:
3871             case PIPE_FORMAT_DXT1_SRGBA:
3872             case PIPE_FORMAT_DXT3_SRGBA:
3873             case PIPE_FORMAT_DXT5_SRGBA:
3874             case PIPE_FORMAT_BPTC_SRGBA:
3875             case PIPE_FORMAT_ETC2_SRGB8:
3876             case PIPE_FORMAT_ETC2_SRGB8A1:
3877             case PIPE_FORMAT_ETC2_SRGBA8:
3878                num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
3879                break;
3880             case PIPE_FORMAT_RGTC1_SNORM:
3881             case PIPE_FORMAT_LATC1_SNORM:
3882             case PIPE_FORMAT_RGTC2_SNORM:
3883             case PIPE_FORMAT_LATC2_SNORM:
3884             case PIPE_FORMAT_ETC2_R11_SNORM:
3885             case PIPE_FORMAT_ETC2_RG11_SNORM:
3886             /* implies float, so use SNORM/UNORM to determine
3887                whether data is signed or not */
3888             case PIPE_FORMAT_BPTC_RGB_FLOAT:
3889                num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
3890                break;
3891             default:
3892                num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3893                break;
3894             }
3895          } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
3896             num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3897          } else {
3898             num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
3899          }
3900       } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
3901          num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
3902       } else {
3903          num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3904 
3905          switch (desc->channel[first_non_void].type) {
3906          case UTIL_FORMAT_TYPE_FLOAT:
3907             num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
3908             break;
3909          case UTIL_FORMAT_TYPE_SIGNED:
3910             if (desc->channel[first_non_void].normalized)
3911                num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
3912             else if (desc->channel[first_non_void].pure_integer)
3913                num_format = V_008F14_IMG_NUM_FORMAT_SINT;
3914             else
3915                num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
3916             break;
3917          case UTIL_FORMAT_TYPE_UNSIGNED:
3918             if (desc->channel[first_non_void].normalized)
3919                num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3920             else if (desc->channel[first_non_void].pure_integer)
3921                num_format = V_008F14_IMG_NUM_FORMAT_UINT;
3922             else
3923                num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
3924          }
3925       }
3926    }
3927 
3928    data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
3929    if (data_format == ~0) {
3930       data_format = 0;
3931    }
3932 
3933    /* S8 with Z32 HTILE needs a special format. */
3934    if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT)
3935       data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
3936 
3937    if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY ||
3938                     (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) {
3939       /* For the purpose of shader images, treat cube maps and 3D
3940        * textures as 2D arrays. For 3D textures, the address
3941        * calculations for mipmaps are different, so we rely on the
3942        * caller to effectively disable mipmaps.
3943        */
3944       type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
3945 
3946       assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
3947    } else {
3948       type = si_tex_dim(screen, tex, target, num_samples);
3949    }
3950 
3951    if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
3952       height = 1;
3953       depth = res->array_size;
3954    } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
3955       if (sampler || res->target != PIPE_TEXTURE_3D)
3956          depth = res->array_size;
3957    } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
3958       depth = res->array_size / 6;
3959 
3960    state[0] = 0;
3961    state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format));
3962    state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4));
3963    state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
3964                S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
3965                S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
3966                S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
3967                S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
3968                S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) |
3969                S_008F1C_TYPE(type));
3970    state[4] = 0;
3971    state[5] = S_008F24_BASE_ARRAY(first_layer);
3972    state[6] = 0;
3973    state[7] = 0;
3974 
3975    if (screen->info.chip_class == GFX9) {
3976       unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
3977 
3978       /* Depth is the the last accessible layer on Gfx9.
3979        * The hw doesn't need to know the total number of layers.
3980        */
3981       if (type == V_008F1C_SQ_RSRC_IMG_3D)
3982          state[4] |= S_008F20_DEPTH(depth - 1);
3983       else
3984          state[4] |= S_008F20_DEPTH(last_layer);
3985 
3986       state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
3987       state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples)
3988                                                    : tex->buffer.b.b.last_level);
3989    } else {
3990       state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
3991       state[4] |= S_008F20_DEPTH(depth - 1);
3992       state[5] |= S_008F24_LAST_ARRAY(last_layer);
3993    }
3994 
3995    if (vi_dcc_enabled(tex, first_level)) {
3996       state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
3997    } else {
3998       /* The last dword is unused by hw. The shader uses it to clear
3999        * bits in the first dword of sampler state.
4000        */
4001       if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
4002          if (first_level == last_level)
4003             state[7] = C_008F30_MAX_ANISO_RATIO;
4004          else
4005             state[7] = 0xffffffff;
4006       }
4007    }
4008 
4009    /* Initialize the sampler view for FMASK. */
4010    if (tex->surface.fmask_offset) {
4011       uint32_t data_format, num_format;
4012 
4013       va = tex->buffer.gpu_address + tex->surface.fmask_offset;
4014 
4015 #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
4016       if (screen->info.chip_class == GFX9) {
4017          data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
4018          switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
4019          case FMASK(2, 1):
4020             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_1;
4021             break;
4022          case FMASK(2, 2):
4023             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_2;
4024             break;
4025          case FMASK(4, 1):
4026             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_1;
4027             break;
4028          case FMASK(4, 2):
4029             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_2;
4030             break;
4031          case FMASK(4, 4):
4032             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_4;
4033             break;
4034          case FMASK(8, 1):
4035             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_8_1;
4036             break;
4037          case FMASK(8, 2):
4038             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_8_2;
4039             break;
4040          case FMASK(8, 4):
4041             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_4;
4042             break;
4043          case FMASK(8, 8):
4044             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_8;
4045             break;
4046          case FMASK(16, 1):
4047             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_16_1;
4048             break;
4049          case FMASK(16, 2):
4050             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_16_2;
4051             break;
4052          case FMASK(16, 4):
4053             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_4;
4054             break;
4055          case FMASK(16, 8):
4056             num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_8;
4057             break;
4058          default:
4059             unreachable("invalid nr_samples");
4060          }
4061       } else {
4062          switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
4063          case FMASK(2, 1):
4064             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
4065             break;
4066          case FMASK(2, 2):
4067             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
4068             break;
4069          case FMASK(4, 1):
4070             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
4071             break;
4072          case FMASK(4, 2):
4073             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
4074             break;
4075          case FMASK(4, 4):
4076             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
4077             break;
4078          case FMASK(8, 1):
4079             data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
4080             break;
4081          case FMASK(8, 2):
4082             data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
4083             break;
4084          case FMASK(8, 4):
4085             data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
4086             break;
4087          case FMASK(8, 8):
4088             data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
4089             break;
4090          case FMASK(16, 1):
4091             data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
4092             break;
4093          case FMASK(16, 2):
4094             data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
4095             break;
4096          case FMASK(16, 4):
4097             data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
4098             break;
4099          case FMASK(16, 8):
4100             data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
4101             break;
4102          default:
4103             unreachable("invalid nr_samples");
4104          }
4105          num_format = V_008F14_IMG_NUM_FORMAT_UINT;
4106       }
4107 #undef FMASK
4108 
4109       fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
4110       fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) |
4111                        S_008F14_NUM_FORMAT(num_format);
4112       fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1);
4113       fmask_state[3] =
4114          S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
4115          S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
4116          S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
4117       fmask_state[4] = 0;
4118       fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
4119       fmask_state[6] = 0;
4120       fmask_state[7] = 0;
4121 
4122       if (screen->info.chip_class == GFX9) {
4123          fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
4124          fmask_state[4] |=
4125             S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
4126          fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(1) |
4127                            S_008F24_META_RB_ALIGNED(1);
4128       } else {
4129          fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
4130          fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
4131                            S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
4132          fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
4133       }
4134    }
4135 }
4136 
4137 /**
4138  * Create a sampler view.
4139  *
4140  * @param ctx		context
4141  * @param texture	texture
4142  * @param state		sampler view template
4143  * @param width0	width0 override (for compressed textures as int)
4144  * @param height0	height0 override (for compressed textures as int)
4145  * @param force_level   set the base address to the level (for compressed textures)
4146  */
struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
                                                        struct pipe_resource *texture,
                                                        const struct pipe_sampler_view *state,
                                                        unsigned width0, unsigned height0,
                                                        unsigned force_level)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
   struct si_texture *tex = (struct si_texture *)texture;
   unsigned base_level, first_level, last_level;
   unsigned char state_swizzle[4];
   unsigned height, depth, width;
   unsigned last_layer = state->u.tex.last_layer;
   enum pipe_format pipe_format;
   const struct legacy_surf_level *surflevel;

   if (!view)
      return NULL;

   /* initialize base object */
   view->base = *state;
   view->base.texture = NULL;
   view->base.reference.count = 1;
   view->base.context = ctx;

   assert(texture);
   pipe_resource_reference(&view->base.texture, texture);

   /* Stencil-only view formats sample the stencil plane of a Z/S texture. */
   if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT ||
       state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT)
      view->is_stencil_sampler = true;

   /* Buffer resource. */
   if (texture->target == PIPE_BUFFER) {
      si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format,
                                state->u.buf.offset, state->u.buf.size, view->state);
      return &view->base;
   }

   state_swizzle[0] = state->swizzle_r;
   state_swizzle[1] = state->swizzle_g;
   state_swizzle[2] = state->swizzle_b;
   state_swizzle[3] = state->swizzle_a;

   base_level = 0;
   first_level = state->u.tex.first_level;
   last_level = state->u.tex.last_level;
   width = width0;
   height = height0;
   depth = texture->depth0;

   /* force_level makes the view address a single mip level directly
    * (used for compressed textures); only needed/valid on GFX8 and older. */
   if (sctx->chip_class <= GFX8 && force_level) {
      assert(force_level == first_level && force_level == last_level);
      base_level = force_level;
      first_level = 0;
      last_level = 0;
      width = u_minify(width, force_level);
      height = u_minify(height, force_level);
      depth = u_minify(depth, force_level);
   }

   /* This is not needed if gallium frontends set last_layer correctly. */
   if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D ||
       state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE)
      last_layer = state->u.tex.first_layer;

   /* Texturing with separate depth and stencil. */
   pipe_format = state->format;

   /* Depth/stencil texturing sometimes needs separate texture. */
   if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
      /* Lazily create the decompressed ("flushed") copy we can sample from. */
      if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) {
         pipe_resource_reference(&view->base.texture, NULL);
         FREE(view);
         return NULL;
      }

      assert(tex->flushed_depth_texture);

      /* Override format for the case where the flushed texture
       * contains only Z or only S.
       */
      if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
         pipe_format = tex->flushed_depth_texture->buffer.b.b.format;

      /* Sample from the flushed copy instead of the original. */
      tex = tex->flushed_depth_texture;
   }

   surflevel = tex->surface.u.legacy.level;

   /* DB-compatible textures need format fixups so TC reads match DB layout. */
   if (tex->db_compatible) {
      if (!view->is_stencil_sampler)
         pipe_format = tex->db_render_format;

      switch (pipe_format) {
      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
         pipe_format = PIPE_FORMAT_Z32_FLOAT;
         break;
      case PIPE_FORMAT_X8Z24_UNORM:
      case PIPE_FORMAT_S8_UINT_Z24_UNORM:
         /* Z24 is always stored like this for DB
          * compatibility.
          */
         pipe_format = PIPE_FORMAT_Z24X8_UNORM;
         break;
      case PIPE_FORMAT_X24S8_UINT:
      case PIPE_FORMAT_S8X24_UINT:
      case PIPE_FORMAT_X32_S8X24_UINT:
         /* Stencil views read from the separate stencil mip layout. */
         pipe_format = PIPE_FORMAT_S8_UINT;
         surflevel = tex->surface.u.legacy.stencil_level;
         break;
      default:;
      }
   }

   view->dcc_incompatible =
      vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format);

   /* Build the hardware image descriptor (and FMASK descriptor if any). */
   sctx->screen->make_texture_descriptor(
      sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level,
      state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state);

   const struct util_format_description *desc = util_format_description(pipe_format);
   view->is_integer = false;

   for (unsigned i = 0; i < desc->nr_channels; ++i) {
      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
         continue;

      /* Whether the number format is {U,S}{SCALED,INT} */
      view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
                          desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
                         (desc->channel[i].pure_integer || !desc->channel[i].normalized);
      break;
   }

   view->base_level_info = &surflevel[base_level];
   view->base_level = base_level;
   view->block_width = util_format_get_blockwidth(pipe_format);
   return &view->base;
}
4288 
si_create_sampler_view(struct pipe_context * ctx,struct pipe_resource * texture,const struct pipe_sampler_view * state)4289 static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
4290                                                         struct pipe_resource *texture,
4291                                                         const struct pipe_sampler_view *state)
4292 {
4293    return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0,
4294                                         texture ? texture->height0 : 0, 0);
4295 }
4296 
si_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)4297 static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state)
4298 {
4299    struct si_sampler_view *view = (struct si_sampler_view *)state;
4300 
4301    pipe_resource_reference(&state->texture, NULL);
4302    FREE(view);
4303 }
4304 
static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
{
   /* CLAMP_TO_BORDER / MIRROR_CLAMP_TO_BORDER always sample the border
    * color; plain (MIRROR_)CLAMP only reaches it with linear filtering. */
   switch (wrap) {
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return true;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      return linear_filter;
   default:
      return false;
   }
}
4310 
static uint32_t si_translate_border_color(struct si_context *sctx,
                                          const struct pipe_sampler_state *state,
                                          const union pipe_color_union *color, bool is_integer)
{
   /* Linear filtering in either min or mag makes border texels reachable
    * with the plain CLAMP wrap modes. */
   bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
                        state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;

   /* If no wrap mode can ever sample the border, any color works; use the
    * cheapest built-in one. */
   if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
       !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
       !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
      return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);

/* Match the three border colors the hardware encodes directly, so no
 * table slot is needed. 'elt' selects the float or uint channel view. */
#define simple_border_types(elt)                                                                   \
   do {                                                                                            \
      if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0)    \
         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);              \
      if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1)    \
         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK);             \
      if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1)    \
         return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE);             \
   } while (false)

   if (is_integer)
      simple_border_types(ui);
   else
      simple_border_types(f);

#undef simple_border_types

   int i;

   /* Check if the border has been uploaded already. */
   for (i = 0; i < sctx->border_color_count; i++)
      if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0)
         break;

   if (i >= SI_MAX_BORDER_COLORS) {
      /* Getting 4096 unique border colors is very unlikely. */
      fprintf(stderr, "radeonsi: The border color table is full. "
                      "Any new border colors will be just black. "
                      "Please file a bug.\n");
      return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
   }

   if (i == sctx->border_color_count) {
      /* Upload a new border color. */
      memcpy(&sctx->border_color_table[i], color, sizeof(*color));
      util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color));
      sctx->border_color_count++;
   }

   /* Point the sampler at the (existing or newly appended) table slot. */
   return S_008F3C_BORDER_COLOR_PTR(i) |
          S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
}
4365 
/* Convert a float to signed fixed point with 'frac_bits' fractional bits
 * (truncates toward zero, like a plain float-to-int cast). */
static inline int S_FIXED(float value, unsigned frac_bits)
{
   const int scale = 1 << frac_bits;
   return (int)(value * scale);
}
4370 
si_tex_filter(unsigned filter,unsigned max_aniso)4371 static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso)
4372 {
4373    if (filter == PIPE_TEX_FILTER_LINEAR)
4374       return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
4375                            : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
4376    else
4377       return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
4378                            : V_008F38_SQ_TEX_XY_FILTER_POINT;
4379 }
4380 
/* Map a max-anisotropy value to the hardware's log2 ratio field:
 * <2 -> 0 (1x), <4 -> 1 (2x), <8 -> 2 (4x), <16 -> 3 (8x), else 4 (16x). */
static inline unsigned si_tex_aniso_filter(unsigned filter)
{
   unsigned ratio = 0;

   while (ratio < 4 && filter >= (2u << ratio))
      ratio++;
   return ratio;
}
4393 
/* Create a sampler CSO: pack the gallium sampler state into the four
 * SQ_IMG_SAMP descriptor words, plus integer and upgraded-depth variants. */
static void *si_create_sampler_state(struct pipe_context *ctx,
                                     const struct pipe_sampler_state *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_screen *sscreen = sctx->screen;
   struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
   /* A negative force_aniso means "not forced"; use the state's value then. */
   unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
   unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
   /* Coordinate truncation only applies to pure nearest filtering without
    * depth compare (and can be disabled by a driver option). */
   bool trunc_coord = !sscreen->options.no_trunc_coord &&
                      state->min_img_filter == PIPE_TEX_FILTER_NEAREST &&
                      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
                      state->compare_mode == PIPE_TEX_COMPARE_NONE;
   union pipe_color_union clamped_border_color;

   if (!rstate) {
      return NULL;
   }

#ifndef NDEBUG
   /* Magic value so bind/delete can sanity-check the CSO type. */
   rstate->magic = SI_SAMPLER_STATE_MAGIC;
#endif
   rstate->val[0] =
      (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
       S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
       S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
       S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
       S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) |
       S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
       S_008F30_TRUNC_COORD(trunc_coord) |
       S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
   /* LODs are unsigned 4.8 fixed point, clamped to the [0, 15] mip range. */
   rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
                     S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
                     S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
   /* LOD bias is signed 5.8 fixed point in [-16, 16]. */
   rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
                     S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
                     S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
                     S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
                     S_008F38_MIP_POINT_PRECLAMP(0));
   rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);

   /* The ANISO_OVERRIDE field moved between generations. */
   if (sscreen->info.chip_class >= GFX10) {
      rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
   } else {
      rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
                        S_008F38_FILTER_PREC_FIX(1) |
                        S_008F38_ANISO_OVERRIDE_GFX8(sctx->chip_class >= GFX8);
   }

   /* Create sampler resource for integer textures. */
   memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
   rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);

   /* Create sampler resource for upgraded depth textures. */
   memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));

   for (unsigned i = 0; i < 4; ++i) {
      /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
       * when the border color is 1.0. */
      clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
   }

   if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
      if (sscreen->info.chip_class <= GFX9)
         rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
   } else {
      /* The clamped color differs, so it needs its own border color slot. */
      rstate->upgraded_depth_val[3] =
         si_translate_border_color(sctx, state, &clamped_border_color, false);
   }

   return rstate;
}
4465 
si_set_sample_mask(struct pipe_context * ctx,unsigned sample_mask)4466 static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
4467 {
4468    struct si_context *sctx = (struct si_context *)ctx;
4469 
4470    if (sctx->sample_mask == (uint16_t)sample_mask)
4471       return;
4472 
4473    sctx->sample_mask = sample_mask;
4474    si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
4475 }
4476 
si_emit_sample_mask(struct si_context * sctx)4477 static void si_emit_sample_mask(struct si_context *sctx)
4478 {
4479    struct radeon_cmdbuf *cs = sctx->gfx_cs;
4480    unsigned mask = sctx->sample_mask;
4481 
4482    /* Needed for line and polygon smoothing as well as for the Polaris
4483     * small primitive filter. We expect the gallium frontend to take care of
4484     * this for us.
4485     */
4486    assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
4487           (mask & 1 && sctx->blitter->running));
4488 
4489    radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4490    radeon_emit(cs, mask | (mask << 16));
4491    radeon_emit(cs, mask | (mask << 16));
4492 }
4493 
si_delete_sampler_state(struct pipe_context * ctx,void * state)4494 static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
4495 {
4496 #ifndef NDEBUG
4497    struct si_sampler_state *s = state;
4498 
4499    assert(s->magic == SI_SAMPLER_STATE_MAGIC);
4500    s->magic = 0;
4501 #endif
4502    free(state);
4503 }
4504 
4505 /*
4506  * Vertex elements & buffers
4507  */
4508 
/* Compute fast-division parameters for divisor D and repack them into the
 * 4x32-bit layout that is uploaded for shader-side instance divisors. */
struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
{
   struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32);

   return (struct si_fast_udiv_info32){
      .multiplier = info.multiplier,
      .pre_shift = info.pre_shift,
      .post_shift = info.post_shift,
      .increment = info.increment,
   };
}
4521 
si_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * elements)4522 static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
4523                                        const struct pipe_vertex_element *elements)
4524 {
4525    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
4526    struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
4527    bool used[SI_NUM_VERTEX_BUFFERS] = {};
4528    struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
4529    STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
4530    STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
4531    STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
4532    STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
4533    STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
4534    int i;
4535 
4536    assert(count <= SI_MAX_ATTRIBS);
4537    if (!v)
4538       return NULL;
4539 
4540    v->count = count;
4541 
4542    unsigned alloc_count =
4543       count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0;
4544    v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
4545 
4546    for (i = 0; i < count; ++i) {
4547       const struct util_format_description *desc;
4548       const struct util_format_channel_description *channel;
4549       int first_non_void;
4550       unsigned vbo_index = elements[i].vertex_buffer_index;
4551 
4552       if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
4553          FREE(v);
4554          return NULL;
4555       }
4556 
4557       unsigned instance_divisor = elements[i].instance_divisor;
4558       if (instance_divisor) {
4559          v->uses_instance_divisors = true;
4560 
4561          if (instance_divisor == 1) {
4562             v->instance_divisor_is_one |= 1u << i;
4563          } else {
4564             v->instance_divisor_is_fetched |= 1u << i;
4565             divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32);
4566          }
4567       }
4568 
4569       if (!used[vbo_index]) {
4570          v->first_vb_use_mask |= 1 << i;
4571          used[vbo_index] = true;
4572       }
4573 
4574       desc = util_format_description(elements[i].src_format);
4575       first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
4576       channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
4577 
4578       v->format_size[i] = desc->block.bits / 8;
4579       v->src_offset[i] = elements[i].src_offset;
4580       v->vertex_buffer_index[i] = vbo_index;
4581 
4582       bool always_fix = false;
4583       union si_vs_fix_fetch fix_fetch;
4584       unsigned log_hw_load_size; /* the load element size as seen by the hardware */
4585 
4586       fix_fetch.bits = 0;
4587       log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
4588 
4589       if (channel) {
4590          switch (channel->type) {
4591          case UTIL_FORMAT_TYPE_FLOAT:
4592             fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
4593             break;
4594          case UTIL_FORMAT_TYPE_FIXED:
4595             fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
4596             break;
4597          case UTIL_FORMAT_TYPE_SIGNED: {
4598             if (channel->pure_integer)
4599                fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
4600             else if (channel->normalized)
4601                fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
4602             else
4603                fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
4604             break;
4605          }
4606          case UTIL_FORMAT_TYPE_UNSIGNED: {
4607             if (channel->pure_integer)
4608                fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
4609             else if (channel->normalized)
4610                fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
4611             else
4612                fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
4613             break;
4614          }
4615          default:
4616             unreachable("bad format type");
4617          }
4618       } else {
4619          switch (elements[i].src_format) {
4620          case PIPE_FORMAT_R11G11B10_FLOAT:
4621             fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
4622             break;
4623          default:
4624             unreachable("bad other format");
4625          }
4626       }
4627 
4628       if (desc->channel[0].size == 10) {
4629          fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
4630          log_hw_load_size = 2;
4631 
4632          /* The hardware always treats the 2-bit alpha channel as
4633           * unsigned, so a shader workaround is needed. The affected
4634           * chips are GFX8 and older except Stoney (GFX8.1).
4635           */
4636          always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY &&
4637                       channel->type == UTIL_FORMAT_TYPE_SIGNED;
4638       } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
4639          fix_fetch.u.log_size = 3; /* special encoding */
4640          fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
4641          log_hw_load_size = 2;
4642       } else {
4643          fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
4644          fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
4645 
4646          /* Always fix up:
4647           * - doubles (multiple loads + truncate to float)
4648           * - 32-bit requiring a conversion
4649           */
4650          always_fix = (fix_fetch.u.log_size == 3) ||
4651                       (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
4652                        fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
4653                        fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
4654 
4655          /* Also fixup 8_8_8 and 16_16_16. */
4656          if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
4657             always_fix = true;
4658             log_hw_load_size = fix_fetch.u.log_size;
4659          }
4660       }
4661 
4662       if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
4663          assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
4664                 (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
4665          fix_fetch.u.reverse = 1;
4666       }
4667 
4668       /* Force the workaround for unaligned access here already if the
4669        * offset relative to the vertex buffer base is unaligned.
4670        *
4671        * There is a theoretical case in which this is too conservative:
4672        * if the vertex buffer's offset is also unaligned in just the
4673        * right way, we end up with an aligned address after all.
4674        * However, this case should be extremely rare in practice (it
4675        * won't happen in well-behaved applications), and taking it
4676        * into account would complicate the fast path (where everything
4677        * is nicely aligned).
4678        */
4679       bool check_alignment =
4680             log_hw_load_size >= 1 &&
4681             (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class >= GFX10);
4682       bool opencode = sscreen->options.vs_fetch_always_opencode;
4683 
4684       if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
4685          opencode = true;
4686 
4687       if (always_fix || check_alignment || opencode)
4688          v->fix_fetch[i] = fix_fetch.bits;
4689 
4690       if (opencode)
4691          v->fix_fetch_opencode |= 1 << i;
4692       if (opencode || always_fix)
4693          v->fix_fetch_always |= 1 << i;
4694 
4695       if (check_alignment && !opencode) {
4696          assert(log_hw_load_size == 1 || log_hw_load_size == 2);
4697 
4698          v->fix_fetch_unaligned |= 1 << i;
4699          v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
4700          v->vb_alignment_check_mask |= 1 << vbo_index;
4701       }
4702 
4703       v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
4704                          S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
4705                          S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
4706                          S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
4707 
4708       if (sscreen->info.chip_class >= GFX10) {
4709          const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format];
4710          assert(fmt->img_format != 0 && fmt->img_format < 128);
4711          v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1);
4712       } else {
4713          unsigned data_format, num_format;
4714          data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
4715          num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
4716          v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
4717       }
4718    }
4719 
4720    if (v->instance_divisor_is_fetched) {
4721       unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
4722 
4723       v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create(
4724          &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0]));
4725       if (!v->instance_divisor_factor_buffer) {
4726          FREE(v);
4727          return NULL;
4728       }
4729       void *map =
4730          sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_MAP_WRITE);
4731       memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0]));
4732    }
4733    return v;
4734 }
4735 
static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_vertex_elements *old = sctx->vertex_elements;
   struct si_vertex_elements *v = (struct si_vertex_elements *)state;

   sctx->vertex_elements = v;
   sctx->num_vertex_elements = v ? v->count : 0;

   if (sctx->num_vertex_elements) {
      sctx->vertex_buffers_dirty = true;
   } else {
      /* No elements: vertex buffer descriptors can't be needed. */
      sctx->vertex_buffer_pointer_dirty = false;
      sctx->vertex_buffer_user_sgprs_dirty = false;
   }

   /* Recompile shaders only if a property that's baked into the VS key
    * could have changed with the new element state. */
   if (v && (!old || old->count != v->count ||
             old->uses_instance_divisors != v->uses_instance_divisors ||
             /* we don't check which divisors changed */
             v->uses_instance_divisors ||
             (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) &
                sctx->vertex_buffer_unaligned ||
             ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
              memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
                     sizeof(v->vertex_buffer_index[0]) * v->count)) ||
             /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
              * functions of fix_fetch and the src_offset alignment.
              * If they change and fix_fetch doesn't, it must be due to different
              * src_offset alignment, which is reflected in fix_fetch_opencode. */
             old->fix_fetch_opencode != v->fix_fetch_opencode ||
             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
      sctx->do_update_shaders = true;

   /* Expose the precomputed divisor factors to the VS as an RW buffer. */
   if (v && v->instance_divisor_is_fetched) {
      struct pipe_constant_buffer cb;

      cb.buffer = &v->instance_divisor_factor_buffer->b.b;
      cb.user_buffer = NULL;
      cb.buffer_offset = 0;
      cb.buffer_size = 0xffffffff;
      si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
   }
}
4779 
si_delete_vertex_element(struct pipe_context * ctx,void * state)4780 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
4781 {
4782    struct si_context *sctx = (struct si_context *)ctx;
4783    struct si_vertex_elements *v = (struct si_vertex_elements *)state;
4784 
4785    if (sctx->vertex_elements == state) {
4786       sctx->vertex_elements = NULL;
4787       sctx->num_vertex_elements = 0;
4788    }
4789    si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
4790    FREE(state);
4791 }
4792 
si_set_vertex_buffers(struct pipe_context * ctx,unsigned start_slot,unsigned count,const struct pipe_vertex_buffer * buffers)4793 static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count,
4794                                   const struct pipe_vertex_buffer *buffers)
4795 {
4796    struct si_context *sctx = (struct si_context *)ctx;
4797    struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
4798    unsigned updated_mask = u_bit_consecutive(start_slot, count);
4799    uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
4800    uint32_t unaligned = 0;
4801    int i;
4802 
4803    assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
4804 
4805    if (buffers) {
4806       for (i = 0; i < count; i++) {
4807          const struct pipe_vertex_buffer *src = buffers + i;
4808          struct pipe_vertex_buffer *dsti = dst + i;
4809          struct pipe_resource *buf = src->buffer.resource;
4810          unsigned slot_bit = 1 << (start_slot + i);
4811 
4812          pipe_resource_reference(&dsti->buffer.resource, buf);
4813          dsti->buffer_offset = src->buffer_offset;
4814          dsti->stride = src->stride;
4815 
4816          if (dsti->buffer_offset & 3 || dsti->stride & 3)
4817             unaligned |= slot_bit;
4818 
4819          si_context_add_resource_size(sctx, buf);
4820          if (buf)
4821             si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4822       }
4823    } else {
4824       for (i = 0; i < count; i++) {
4825          pipe_resource_reference(&dst[i].buffer.resource, NULL);
4826       }
4827       unaligned &= ~updated_mask;
4828    }
4829    sctx->vertex_buffers_dirty = true;
4830    sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
4831 
4832    /* Check whether alignment may have changed in a way that requires
4833     * shader changes. This check is conservative: a vertex buffer can only
4834     * trigger a shader change if the misalignment amount changes (e.g.
4835     * from byte-aligned to short-aligned), but we only keep track of
4836     * whether buffers are at least dword-aligned, since that should always
4837     * be the case in well-behaved applications anyway.
4838     */
4839    if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask &
4840                                  (unaligned | orig_unaligned) & updated_mask))
4841       sctx->do_update_shaders = true;
4842 }
4843 
4844 /*
4845  * Misc
4846  */
4847 
si_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])4848 static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4],
4849                               const float default_inner_level[2])
4850 {
4851    struct si_context *sctx = (struct si_context *)ctx;
4852    struct pipe_constant_buffer cb;
4853    float array[8];
4854 
4855    memcpy(array, default_outer_level, sizeof(float) * 4);
4856    memcpy(array + 4, default_inner_level, sizeof(float) * 2);
4857 
4858    cb.buffer = NULL;
4859    cb.user_buffer = NULL;
4860    cb.buffer_size = sizeof(array);
4861 
4862    si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array),
4863                           &cb.buffer_offset);
4864 
4865    si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
4866    pipe_resource_reference(&cb.buffer, NULL);
4867 }
4868 
si_texture_barrier(struct pipe_context * ctx,unsigned flags)4869 static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
4870 {
4871    struct si_context *sctx = (struct si_context *)ctx;
4872 
4873    si_update_fb_dirtiness_after_rendering(sctx);
4874 
4875    /* Multisample surfaces are flushed in si_decompress_textures. */
4876    if (sctx->framebuffer.uncompressed_cb_mask) {
4877       si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
4878                                  sctx->framebuffer.CB_has_shader_readable_metadata,
4879                                  sctx->framebuffer.all_DCC_pipe_aligned);
4880    }
4881 }
4882 
/* This only ensures coherency for shader image/buffer stores.
 *
 * pipe_context::memory_barrier — translate PIPE_BARRIER_* bits into the
 * context flush flags (cache invalidations / writebacks) that will be
 * emitted before the next draw/dispatch.
 */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct si_context *sctx = (struct si_context *)ctx;

   /* Nothing to do when only PIPE_BARRIER_UPDATE bits are requested. */
   if (!(flags & ~PIPE_BARRIER_UPDATE))
      return;

   /* Subsequent commands must wait for all shader invocations to
    * complete. */
   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;

   /* Constant buffers may be read through the scalar or vector caches. */
   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

   if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
                PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) {
      /* As far as I can tell, L1 contents are written back to L2
       * automatically at end of shader, but the contents of other
       * L1 caches might still be stale. */
      sctx->flags |= SI_CONTEXT_INV_VCACHE;
   }

   if (flags & PIPE_BARRIER_INDEX_BUFFER) {
      /* Indices are read through TC L2 since GFX8.
       * L1 isn't used.
       */
      if (sctx->screen->info.chip_class <= GFX7)
         sctx->flags |= SI_CONTEXT_WB_L2;
   }

   /* MSAA color, any depth and any stencil are flushed in
    * si_decompress_textures when needed.
    */
   if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;

      /* NOTE(review): presumably CB output bypasses TC L2 on GFX8 and
       * older, hence the extra writeback — confirm against hw docs. */
      if (sctx->chip_class <= GFX8)
         sctx->flags |= SI_CONTEXT_WB_L2;
   }

   /* Indirect buffers use TC L2 on GFX9, but not older hw. */
   if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
      sctx->flags |= SI_CONTEXT_WB_L2;
}
4928 
si_create_blend_custom(struct si_context * sctx,unsigned mode)4929 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
4930 {
4931    struct pipe_blend_state blend;
4932 
4933    memset(&blend, 0, sizeof(blend));
4934    blend.independent_blend_enable = true;
4935    blend.rt[0].colormask = 0xf;
4936    return si_create_blend_state_mode(&sctx->b, &blend, mode);
4937 }
4938 
si_init_state_compute_functions(struct si_context * sctx)4939 void si_init_state_compute_functions(struct si_context *sctx)
4940 {
4941    sctx->b.create_sampler_state = si_create_sampler_state;
4942    sctx->b.delete_sampler_state = si_delete_sampler_state;
4943    sctx->b.create_sampler_view = si_create_sampler_view;
4944    sctx->b.sampler_view_destroy = si_sampler_view_destroy;
4945    sctx->b.memory_barrier = si_memory_barrier;
4946 }
4947 
si_init_state_functions(struct si_context * sctx)4948 void si_init_state_functions(struct si_context *sctx)
4949 {
4950    sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
4951    sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
4952    sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
4953    sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
4954    sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
4955    sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
4956    sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
4957    sctx->atoms.s.blend_color.emit = si_emit_blend_color;
4958    sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
4959    sctx->atoms.s.clip_state.emit = si_emit_clip_state;
4960    sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
4961 
4962    sctx->b.create_blend_state = si_create_blend_state;
4963    sctx->b.bind_blend_state = si_bind_blend_state;
4964    sctx->b.delete_blend_state = si_delete_blend_state;
4965    sctx->b.set_blend_color = si_set_blend_color;
4966 
4967    sctx->b.create_rasterizer_state = si_create_rs_state;
4968    sctx->b.bind_rasterizer_state = si_bind_rs_state;
4969    sctx->b.delete_rasterizer_state = si_delete_rs_state;
4970 
4971    sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
4972    sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
4973    sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
4974 
4975    sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
4976    sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
4977    sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
4978    sctx->custom_blend_eliminate_fastclear =
4979       si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
4980    sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
4981 
4982    sctx->b.set_clip_state = si_set_clip_state;
4983    sctx->b.set_stencil_ref = si_set_stencil_ref;
4984 
4985    sctx->b.set_framebuffer_state = si_set_framebuffer_state;
4986 
4987    sctx->b.set_sample_mask = si_set_sample_mask;
4988 
4989    sctx->b.create_vertex_elements_state = si_create_vertex_elements;
4990    sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
4991    sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
4992    sctx->b.set_vertex_buffers = si_set_vertex_buffers;
4993 
4994    sctx->b.texture_barrier = si_texture_barrier;
4995    sctx->b.set_min_samples = si_set_min_samples;
4996    sctx->b.set_tess_state = si_set_tess_state;
4997 
4998    sctx->b.set_active_query_state = si_set_active_query_state;
4999 }
5000 
si_init_screen_state_functions(struct si_screen * sscreen)5001 void si_init_screen_state_functions(struct si_screen *sscreen)
5002 {
5003    sscreen->b.is_format_supported = si_is_format_supported;
5004 
5005    if (sscreen->info.chip_class >= GFX10) {
5006       sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
5007    } else {
5008       sscreen->make_texture_descriptor = si_make_texture_descriptor;
5009    }
5010 }
5011 
si_set_grbm_gfx_index(struct si_context * sctx,struct si_pm4_state * pm4,unsigned value)5012 static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
5013 {
5014    unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX;
5015    si_pm4_set_reg(pm4, reg, value);
5016 }
5017 
si_set_grbm_gfx_index_se(struct si_context * sctx,struct si_pm4_state * pm4,unsigned se)5018 static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se)
5019 {
5020    assert(se == ~0 || se < sctx->screen->info.max_se);
5021    si_set_grbm_gfx_index(sctx, pm4,
5022                          (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) |
5023                             S_030800_SH_BROADCAST_WRITES(1) |
5024                             S_030800_INSTANCE_BROADCAST_WRITES(1));
5025 }
5026 
si_write_harvested_raster_configs(struct si_context * sctx,struct si_pm4_state * pm4,unsigned raster_config,unsigned raster_config_1)5027 static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4,
5028                                               unsigned raster_config, unsigned raster_config_1)
5029 {
5030    unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
5031    unsigned raster_config_se[4];
5032    unsigned se;
5033 
5034    ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se);
5035 
5036    for (se = 0; se < num_se; se++) {
5037       si_set_grbm_gfx_index_se(sctx, pm4, se);
5038       si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
5039    }
5040    si_set_grbm_gfx_index(sctx, pm4, ~0);
5041 
5042    if (sctx->chip_class >= GFX7) {
5043       si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
5044    }
5045 }
5046 
si_set_raster_config(struct si_context * sctx,struct si_pm4_state * pm4)5047 static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
5048 {
5049    struct si_screen *sscreen = sctx->screen;
5050    unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
5051    unsigned rb_mask = sscreen->info.enabled_rb_mask;
5052    unsigned raster_config = sscreen->pa_sc_raster_config;
5053    unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
5054 
5055    if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
5056       /* Always use the default config when all backends are enabled
5057        * (or when we failed to determine the enabled backends).
5058        */
5059       si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config);
5060       if (sctx->chip_class >= GFX7)
5061          si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
5062    } else {
5063       si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
5064    }
5065 }
5066 
/* Build the command-stream preamble: a PM4 state object that programs context
 * and uconfig registers to known defaults, stored in sctx->cs_preamble_state.
 *
 * sctx               - context; supplies chip identification and the border
 *                      color buffer whose GPU address is programmed here.
 * uses_reg_shadowing - when true, skip CONTEXT_CONTROL/CLEAR_STATE because
 *                      register state is preserved by shadowing.
 *
 * NOTE(review): on CALLOC failure this returns silently and leaves
 * sctx->cs_preamble_state unset — presumably handled by callers; confirm.
 */
void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
{
   struct si_screen *sscreen = sctx->screen;
   uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
   bool has_clear_state = sscreen->info.has_clear_state;
   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);

   if (!pm4)
      return;

   if (!uses_reg_shadowing) {
      /* Enable state loading/shadowing, then reset context registers to
       * hw defaults where CLEAR_STATE is available. */
      si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1));
      si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1));

      if (has_clear_state) {
         si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0));
         si_pm4_cmd_add(pm4, 0);
      }
   }

   /* CLEAR_STATE doesn't restore these correctly. */
   si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
   si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
                  S_028244_BR_X(16384) | S_028244_BR_Y(16384));

   si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
   if (!has_clear_state)
      si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));

   /* Registers whose defaults CLEAR_STATE would otherwise provide. */
   if (!has_clear_state) {
      si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
                     S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) |
                        /* Required by DX10_DIAMOND_TEST_ENA: */
                        S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) |
                        S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA));
      si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
      si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
      si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
      si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
      si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
      si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
      si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
      si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
      si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
   }

   /* Border color buffer address (high bits only exist on GFX7+). */
   si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
   if (sctx->chip_class >= GFX7)
      si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));

   if (sctx->chip_class == GFX6) {
      si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
                     S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
   }

   if (sctx->chip_class <= GFX7 || !has_clear_state) {
      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
      si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);

      /* CLEAR_STATE doesn't clear these correctly on certain generations.
       * I don't know why. Deduced by trial and error.
       */
      si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
      si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
      si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
      si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
                     S_028034_BR_X(16384) | S_028034_BR_Y(16384));
   }

   unsigned cu_mask_ps = 0xffffffff;

   /* It's wasteful to enable all CUs for PS if shader arrays have a different
    * number of CUs. The reason is that the hardware sends the same number of PS
    * waves to each shader array, so the slowest shader array limits the performance.
    * Disable the extra CUs for PS in other shader arrays to save power and thus
    * increase clocks for busy CUs. In the future, we might disable or enable this
    * tweak only for certain apps.
    */
   if (sctx->chip_class >= GFX10_3)
      cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);

   if (sctx->chip_class >= GFX7) {
      /* Compute LATE_ALLOC_VS.LIMIT. */
      unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
      unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
      unsigned cu_mask_vs = 0xffff;
      unsigned cu_mask_gs = 0xffff;

      if (sctx->chip_class >= GFX10) {
         /* For Wave32, the hw will launch twice the number of late
          * alloc waves, so 1 == 2x wave32.
          */
         if (!sscreen->info.use_late_alloc) {
            late_alloc_wave64 = 0;
         } else if (num_cu_per_sh <= 6) {
            late_alloc_wave64 = num_cu_per_sh - 2;
         } else {
            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;

            /* CU2 & CU3 disabled because of the dual CU design */
            /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
            cu_mask_vs = 0xfff3;
            cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
         }
      } else {
         if (!sscreen->info.use_late_alloc) {
            late_alloc_wave64 = 0;
         } else if (num_cu_per_sh <= 4) {
            /* Too few available compute units per SA. Disallowing
             * VS to run on one CU could hurt us more than late VS
             * allocation would help.
             *
             * 2 is the highest safe number that allows us to keep
             * all CUs enabled.
             */
            late_alloc_wave64 = 2;
         } else {
            /* This is a good initial value, allowing 1 late_alloc
             * wave per SIMD on num_cu - 2.
             */
            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
         }

         if (late_alloc_wave64 > 2)
            cu_mask_vs = 0xfffe; /* 1 CU disabled */
      }

      /* VS can't execute on one CU if the limit is > 2. */
      si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
                     S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
      si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
      si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
                     S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
      si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
                     S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
   }

   if (sctx->chip_class <= GFX8) {
      si_set_raster_config(sctx, pm4);

      /* FIXME calculate these values somehow ??? */
      si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
      si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);

      /* These registers, when written, also overwrite the CLEAR_STATE
       * context, so we can't rely on CLEAR_STATE setting them.
       * It would be an issue if there was another UMD changing them.
       */
      si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
      si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
      si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
   }

   if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) {
      si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
                     S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
      si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
      si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
                     S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));

      /* If this is 0, Bonaire can hang even if GS isn't being used.
       * Other chips are unaffected. These are suboptimal values,
       * but we don't use on-chip GS.
       */
      si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
                     S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
   }

   if (sctx->chip_class == GFX8) {
      unsigned vgt_tess_distribution;

      vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
                              S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16);

      /* Testing with Unigine Heaven extreme tesselation yielded best results
       * with TRAP_SPLIT = 3.
       */
      if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10)
         vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);

      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
   }

   if (sscreen->info.chip_class <= GFX9) {
      si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
   }

   /* On GFX9 these moved to uconfig space (the GFX8 block above wrote the
    * older context-register offsets). */
   if (sctx->chip_class == GFX9) {
      si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
      si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
      si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
   }

   if (sctx->chip_class >= GFX9) {
      si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
                     S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));

      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
                     S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
                     S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
      si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
                     S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
                     S_028C48_MAX_PRIM_PER_BATCH(1023));
      si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
                     S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));

      si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
      si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY,
                     sctx->chip_class >= GFX10 ? 0x20 : 0);
   }

   /* GFX10+ defaults. */
   if (sctx->chip_class >= GFX10) {
      /* Logical CUs 16 - 31 */
      si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(cu_mask_ps >> 16));
      si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
      si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));

      /* Clear all user-accumulator registers. */
      si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
      si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
      si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
      si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
      si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
      si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
      si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0);
      si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
      si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
      si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
      si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
      si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0);
      si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0);
      si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0);
      si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0);
      si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0);

      si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
                     S_00B0C0_SOFT_GROUPING_EN(1) |
                     S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
      si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);

      /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
      unsigned meta_write_policy, meta_read_policy;
      if (sscreen->info.num_render_backends <= 4) {
         meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
         meta_read_policy = V_02807C_CACHE_LRU_RD;  /* cache reads */
      } else {
         meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */
         meta_read_policy = V_02807C_CACHE_NOA;     /* don't cache reads */
      }

      si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
                     S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) |
                     S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) |
                     S_02807C_HTILE_WR_POLICY(meta_write_policy) |
                     S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) |
                     S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA) |
                     S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA) |
                     S_02807C_HTILE_RD_POLICY(meta_read_policy));
      si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
                     S_028410_CMASK_WR_POLICY(meta_write_policy) |
                     S_028410_FMASK_WR_POLICY(meta_write_policy) |
                     S_028410_DCC_WR_POLICY(meta_write_policy) |
                     S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM) |
                     S_028410_CMASK_RD_POLICY(meta_read_policy) |
                     S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
                     S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA));

      si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
      si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);

      /* Break up a pixel wave if it contains deallocs for more than
       * half the parameter cache.
       *
       * To avoid a deadlock where pixel waves aren't launched
       * because they're waiting for more pixels while the frontend
       * is stuck waiting for PC space, the maximum allowed value is
       * the size of the PC minus the largest possible allocation for
       * a single primitive shader subgroup.
       */
      si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
      /* Reuse for legacy (non-NGG) only. */
      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);

      if (!has_clear_state) {
         si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
                        sscreen->info.pa_sc_tile_steering_override);
      }


      si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
      si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
      si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
      si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
      si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
   }

   if (sctx->chip_class >= GFX10_3) {
      si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff);
      /* This allows sample shading. */
      si_pm4_set_reg(pm4, R_028848_PA_CL_VRS_CNTL,
                     S_028848_SAMPLE_ITER_COMBINER_MODE(1));
   }

   sctx->cs_preamble_state = pm4;
}
5372