1 /*
2  * Copyright 2019-2020 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Jonathan Marek <jonathan@marek.ca>
7  */
8 
9 #include "tu_private.h"
10 
11 #include "tu_cs.h"
12 #include "vk_format.h"
13 
14 #include "util/format_r11g11b10f.h"
15 #include "util/format_rgb9e5.h"
16 #include "util/format_srgb.h"
17 #include "util/half_float.h"
18 
/* Convert a float into an unsigned normalized integer that is `bits` wide:
 * the input is clamped to [0, 1], scaled by the largest representable value
 * (2^bits - 1) and rounded with _mesa_lroundevenf.
 */
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   const float max_value = (float) ((1 << bits) - 1);
   const float clamped = CLAMP(val, 0.0f, 1.0f);

   return _mesa_lroundevenf(clamped * max_value);
}
24 
25 /* r2d_ = BLIT_OP_SCALE operations */
26 
27 static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)28 format_to_ifmt(VkFormat format)
29 {
30    if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
31        format == VK_FORMAT_X8_D24_UNORM_PACK32)
32       return R2D_UNORM8;
33 
34    /* get_component_bits doesn't work with depth/stencil formats: */
35    if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
36       return R2D_FLOAT32;
37    if (format == VK_FORMAT_S8_UINT)
38       return R2D_INT8;
39 
40    /* use the size of the red channel to find the corresponding "ifmt" */
41    bool is_int = vk_format_is_int(format);
42    switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
43    case 4: case 5: case 8:
44       return is_int ? R2D_INT8 : R2D_UNORM8;
45    case 10: case 11:
46       return is_int ? R2D_INT16 : R2D_FLOAT16;
47    case 16:
48       if (vk_format_is_float(format))
49          return R2D_FLOAT16;
50       return is_int ? R2D_INT16 : R2D_FLOAT32;
51    case 32:
52       return is_int ? R2D_INT32 : R2D_FLOAT32;
53     default:
54       unreachable("bad format");
55       return 0;
56    }
57 }
58 
59 static void
r2d_coords(struct tu_cs * cs,const VkOffset2D * dst,const VkOffset2D * src,const VkExtent2D * extent)60 r2d_coords(struct tu_cs *cs,
61            const VkOffset2D *dst,
62            const VkOffset2D *src,
63            const VkExtent2D *extent)
64 {
65    tu_cs_emit_regs(cs,
66       A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
67       A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
68 
69    if (!src)
70       return;
71 
72    tu_cs_emit_regs(cs,
73                    A6XX_GRAS_2D_SRC_TL_X(src->x),
74                    A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
75                    A6XX_GRAS_2D_SRC_TL_Y(src->y),
76                    A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
77 }
78 
79 static void
r2d_clear_value(struct tu_cs * cs,VkFormat format,const VkClearValue * val)80 r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
81 {
82    uint32_t clear_value[4] = {};
83 
84    switch (format) {
85    case VK_FORMAT_X8_D24_UNORM_PACK32:
86    case VK_FORMAT_D24_UNORM_S8_UINT:
87       /* cleared as r8g8b8a8_unorm using special format */
88       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
89       clear_value[1] = clear_value[0] >> 8;
90       clear_value[2] = clear_value[0] >> 16;
91       clear_value[3] = val->depthStencil.stencil;
92       break;
93    case VK_FORMAT_D16_UNORM:
94    case VK_FORMAT_D32_SFLOAT:
95       /* R2D_FLOAT32 */
96       clear_value[0] = fui(val->depthStencil.depth);
97       break;
98    case VK_FORMAT_S8_UINT:
99       clear_value[0] = val->depthStencil.stencil;
100       break;
101    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
102       /* cleared as UINT32 */
103       clear_value[0] = float3_to_rgb9e5(val->color.float32);
104       break;
105    default:
106       assert(!vk_format_is_depth_or_stencil(format));
107       const struct util_format_description *desc = vk_format_description(format);
108       enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
109 
110       assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
111                       format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
112 
113       for (unsigned i = 0; i < desc->nr_channels; i++) {
114          const struct util_format_channel_description *ch = &desc->channel[i];
115          if (ifmt == R2D_UNORM8) {
116             float linear = val->color.float32[i];
117             if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
118                linear = util_format_linear_to_srgb_float(val->color.float32[i]);
119 
120             if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
121                clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
122             else
123                clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
124          } else if (ifmt == R2D_FLOAT16) {
125             clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
126          } else {
127             assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
128                    ifmt == R2D_INT16 || ifmt == R2D_INT8);
129             clear_value[i] = val->color.uint32[i];
130          }
131       }
132       break;
133    }
134 
135    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
136    tu_cs_emit_array(cs, clear_value, 4);
137 }
138 
139 static void
r2d_src(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer,VkFilter filter)140 r2d_src(struct tu_cmd_buffer *cmd,
141         struct tu_cs *cs,
142         const struct tu_image_view *iview,
143         uint32_t layer,
144         VkFilter filter)
145 {
146    uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
147    if (filter != VK_FILTER_NEAREST)
148       src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
149 
150    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
151    tu_cs_emit(cs, src_info);
152    tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
153    tu_cs_image_ref_2d(cs, iview, layer, true);
154 
155    tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
156    tu_cs_image_flag_ref(cs, iview, layer);
157 }
158 
159 static void
r2d_src_buffer(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat vk_format,uint64_t va,uint32_t pitch,uint32_t width,uint32_t height)160 r2d_src_buffer(struct tu_cmd_buffer *cmd,
161                struct tu_cs *cs,
162                VkFormat vk_format,
163                uint64_t va, uint32_t pitch,
164                uint32_t width, uint32_t height)
165 {
166    struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
167 
168    tu_cs_emit_regs(cs,
169                    A6XX_SP_PS_2D_SRC_INFO(
170                       .color_format = format.fmt,
171                       .color_swap = format.swap,
172                       .srgb = vk_format_is_srgb(vk_format),
173                       .unk20 = 1,
174                       .unk22 = 1),
175                    A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
176                    A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
177                    A6XX_SP_PS_2D_SRC_HI(va >> 32),
178                    A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
179 }
180 
181 static void
r2d_dst(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)182 r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
183 {
184    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
185    tu_cs_emit(cs, iview->RB_2D_DST_INFO);
186    tu_cs_image_ref_2d(cs, iview, layer, false);
187 
188    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
189    tu_cs_image_flag_ref(cs, iview, layer);
190 }
191 
192 static void
r2d_dst_stencil(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)193 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
194 {
195    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
196    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
197    tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
198    tu_cs_emit(cs, iview->stencil_PITCH);
199 }
200 
201 static void
r2d_dst_buffer(struct tu_cs * cs,VkFormat vk_format,uint64_t va,uint32_t pitch)202 r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
203 {
204    struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
205 
206    tu_cs_emit_regs(cs,
207                    A6XX_RB_2D_DST_INFO(
208                       .color_format = format.fmt,
209                       .color_swap = format.swap,
210                       .srgb = vk_format_is_srgb(vk_format)),
211                    A6XX_RB_2D_DST_LO((uint32_t) va),
212                    A6XX_RB_2D_DST_HI(va >> 32),
213                    A6XX_RB_2D_DST_PITCH(pitch));
214 }
215 
216 static void
r2d_setup_common(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat vk_format,VkImageAspectFlags aspect_mask,enum a6xx_rotation rotation,bool clear,bool ubwc,bool scissor)217 r2d_setup_common(struct tu_cmd_buffer *cmd,
218                  struct tu_cs *cs,
219                  VkFormat vk_format,
220                  VkImageAspectFlags aspect_mask,
221                  enum a6xx_rotation rotation,
222                  bool clear,
223                  bool ubwc,
224                  bool scissor)
225 {
226    enum a6xx_format format = tu6_base_format(vk_format);
227    enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
228    uint32_t unknown_8c01 = 0;
229 
230    if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
231         vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
232       format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
233    }
234 
235    /* note: the only format with partial clearing is D24S8 */
236    if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
237       /* preserve stencil channel */
238       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
239          unknown_8c01 = 0x08000041;
240       /* preserve depth channels */
241       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
242          unknown_8c01 = 0x00084001;
243    }
244 
245    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
246    tu_cs_emit(cs, unknown_8c01);
247 
248    uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
249          .scissor = scissor,
250          .rotate = rotation,
251          .solid_color = clear,
252          .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
253          .color_format = format,
254          .mask = 0xf,
255          .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
256       ).value;
257 
258    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
259    tu_cs_emit(cs, blit_cntl);
260 
261    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
262    tu_cs_emit(cs, blit_cntl);
263 
264    if (format == FMT6_10_10_10_2_UNORM_DEST)
265       format = FMT6_16_16_16_16_FLOAT;
266 
267    tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
268          .sint = vk_format_is_sint(vk_format),
269          .uint = vk_format_is_uint(vk_format),
270          .color_format = format,
271          .srgb = vk_format_is_srgb(vk_format),
272          .mask = 0xf));
273 }
274 
275 static void
r2d_setup(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat vk_format,VkImageAspectFlags aspect_mask,enum a6xx_rotation rotation,bool clear,bool ubwc)276 r2d_setup(struct tu_cmd_buffer *cmd,
277           struct tu_cs *cs,
278           VkFormat vk_format,
279           VkImageAspectFlags aspect_mask,
280           enum a6xx_rotation rotation,
281           bool clear,
282           bool ubwc)
283 {
284    tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
285 
286    r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
287 }
288 
/* Teardown hook of the 2D path; intentionally empty. */
static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to emit; the parameters are deliberately unused */
   (void) cmd;
   (void) cs;
}
295 
296 static void
r2d_run(struct tu_cmd_buffer * cmd,struct tu_cs * cs)297 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
298 {
299    tu_cs_emit_pkt7(cs, CP_BLIT, 1);
300    tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
301 }
302 
303 /* r3d_ = shader path operations */
304 
/* Fill the shader slots of `global` used by the shader (r3d) clear/blit
 * path:
 *  - GLOBAL_SH_VS:      the static vertex shader below
 *  - GLOBAL_SH_FS_BLIT: the static blit fragment shader below
 *  - GLOBAL_SH_FS_CLEAR0 + n, for n in 0..MAX_RTS: a generated clear
 *    fragment shader made of n movs followed by an end instruction
 *
 * The instructions are hand-encoded instr_t values built with the
 * MOV/CAT2/CAT3 helper macros, which are defined here and not #undef'd
 * inside this function.
 */
void
tu_init_clear_blit_shaders(struct tu6_global *global)
{
/* Helpers that build an instr_t initializer for a cat1 mov (s32 -> s32),
 * a full-precision cat2 instruction and a cat3 instruction; `args` are
 * extra designated initializers for the instruction fields.
 */
#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }

   static const instr_t vs_code[] = {
      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
       * r1.xy = r0.w ? c1.zw : c0.zw
       * r0.w = 1.0f
       */
      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
         .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
         .src2 = 3,
         .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
         .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
         .src2 = 3,
         .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
      { .cat0 = { .opc = OPC_END } },
   };

   static const instr_t fs_blit[] = {
      /* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
       * blit path (it's not clear what allows it to not have it)
       */
      CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
      { .cat0 = { .opc = OPC_END } },
   };

   /* the two fixed shaders are copied verbatim into their slots */
   memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
   memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));

   /* one clear shader per render target count, including zero */
   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
      for (uint32_t i = 0; i < num_rts; i++) {
         /* (rpt3)mov.s32s32 r[i].x, (r)c[i].x */
         *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
      }
      /* every clear shader is terminated by an end instruction */
      *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
   }
}
349 
/* Emit the state shared by all shader-path (r3d) operations.
 *
 * Two ir3_shader_variant descriptions are built on the stack to describe
 * the global VS and the selected global FS to the tu6_emit_* helpers; they
 * point at zero-initialized dummy shader/const-state structs.
 *
 * blit:          use the blit FS (GLOBAL_SH_FS_BLIT) with one varying and a
 *                sampler prefetch; otherwise a clear FS is used
 * num_rts:       selects the clear FS (GLOBAL_SH_FS_CLEAR0 + num_rts) and
 *                sizes its constlen / max_reg
 * layered_clear: make the VS's second output VARYING_SLOT_LAYER instead of
 *                VARYING_SLOT_VAR0
 */
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
           bool layered_clear)
{
   struct ir3_const_state dummy_const_state = {};
   struct ir3_shader dummy_shader = {};

   /* VS: takes the vertex id in regid(0, 3), outputs the position in
    * regid(0, 0) and, for blits, a varying in regid(1, 0)
    */
   struct ir3_shader_variant vs = {
      .type = MESA_SHADER_VERTEX,
      .instrlen = 1,
      .constlen = 4,
      .info.max_reg = 1,
      .inputs_count = 1,
      .inputs[0] = {
         .slot = SYSTEM_VALUE_VERTEX_ID,
         .regid = regid(0, 3),
         .sysval = true,
      },
      .outputs_count = blit ? 2 : 1,
      .outputs[0] = {
         .slot = VARYING_SLOT_POS,
         .regid = regid(0, 0),
      },
      .outputs[1] = {
         .slot = VARYING_SLOT_VAR0,
         .regid = regid(1, 0),
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };
   /* a layered clear repurposes the second output as the layer index */
   if (layered_clear) {
      vs.outputs[1].slot = VARYING_SLOT_LAYER;
      vs.outputs[1].regid = regid(1, 1);
      vs.outputs_count = 2;
   }

   /* FS: inputs, sampler and sampler prefetch are only counted for blits */
   struct ir3_shader_variant fs = {
      .type = MESA_SHADER_FRAGMENT,
      .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
      .constlen = align(num_rts, 4),
      .info.max_reg = MAX2(num_rts, 1) - 1,
      .total_in = blit ? 2 : 0,
      .num_samp = blit ? 1 : 0,
      .inputs_count = blit ? 2 : 0,
      .inputs[0] = {
         .slot = VARYING_SLOT_VAR0,
         .inloc = 0,
         .compmask = 3,
         .bary = true,
      },
      .inputs[1] = {
         .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
         .regid = regid(0, 0),
         .sysval = 1,
      },
      .num_sampler_prefetch = blit ? 1 : 0,
      .sampler_prefetch[0] = {
         .src = 0,
         .wrmask = 0xf,
         .cmd = 4,
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };

   /* invalidate all shader stage, IBO, shared const and bindless state */
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   /* only VS and FS are used; the other stages get a NULL variant */
   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
         global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   /* Copy what the blob does here. This will emit an extra 0x3f
    * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
    * this is working around yet.
    */
   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
   tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
   tu_cs_emit(cs, 0);
   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());

   /* VS -> FS linkage, with no tessellation or geometry stages */
   tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, &fs);

   /* perspective division, viewport transform and clipping are disabled */
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());

   /* viewport and screen scissors span (0, 0) to (0x7fff, 0x7fff) */
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   /* index offset and instance start offset are written with defaults */
   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());
}
478 
479 static void
r3d_coords_raw(struct tu_cs * cs,const float * coords)480 r3d_coords_raw(struct tu_cs *cs, const float *coords)
481 {
482    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
483    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
484                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
485                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
486                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
487                   CP_LOAD_STATE6_0_NUM_UNIT(2));
488    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
489    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
490    tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
491 }
492 
493 static void
r3d_coords(struct tu_cs * cs,const VkOffset2D * dst,const VkOffset2D * src,const VkExtent2D * extent)494 r3d_coords(struct tu_cs *cs,
495            const VkOffset2D *dst,
496            const VkOffset2D *src,
497            const VkExtent2D *extent)
498 {
499    int32_t src_x1 = src ? src->x : 0;
500    int32_t src_y1 = src ? src->y : 0;
501    r3d_coords_raw(cs, (float[]) {
502       dst->x,                 dst->y,
503       src_x1,                 src_y1,
504       dst->x + extent->width, dst->y + extent->height,
505       src_x1 + extent->width, src_y1 + extent->height,
506    });
507 }
508 
509 static void
r3d_clear_value(struct tu_cs * cs,VkFormat format,const VkClearValue * val)510 r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
511 {
512    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
513    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
514                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
515                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
516                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
517                   CP_LOAD_STATE6_0_NUM_UNIT(1));
518    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
519    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
520    switch (format) {
521    case VK_FORMAT_X8_D24_UNORM_PACK32:
522    case VK_FORMAT_D24_UNORM_S8_UINT: {
523       /* cleared as r8g8b8a8_unorm using special format */
524       uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
525       tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
526       tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
527       tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
528       tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
529    } break;
530    case VK_FORMAT_D16_UNORM:
531    case VK_FORMAT_D32_SFLOAT:
532       tu_cs_emit(cs, fui(val->depthStencil.depth));
533       tu_cs_emit(cs, 0);
534       tu_cs_emit(cs, 0);
535       tu_cs_emit(cs, 0);
536       break;
537    case VK_FORMAT_S8_UINT:
538       tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
539       tu_cs_emit(cs, 0);
540       tu_cs_emit(cs, 0);
541       tu_cs_emit(cs, 0);
542       break;
543    default:
544       /* as color formats use clear value as-is */
545       assert(!vk_format_is_depth_or_stencil(format));
546       tu_cs_emit_array(cs, val->color.uint32, 4);
547       break;
548    }
549 }
550 
551 static void
r3d_src_common(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const uint32_t * tex_const,uint32_t offset_base,uint32_t offset_ubwc,VkFilter filter)552 r3d_src_common(struct tu_cmd_buffer *cmd,
553                struct tu_cs *cs,
554                const uint32_t *tex_const,
555                uint32_t offset_base,
556                uint32_t offset_ubwc,
557                VkFilter filter)
558 {
559    struct tu_cs_memory texture = { };
560    VkResult result = tu_cs_alloc(&cmd->sub_cs,
561                                  2, /* allocate space for a sampler too */
562                                  A6XX_TEX_CONST_DWORDS, &texture);
563    if (result != VK_SUCCESS) {
564       cmd->record_result = result;
565       return;
566    }
567 
568    memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
569 
570    /* patch addresses for layer offset */
571    *(uint64_t*) (texture.map + 4) += offset_base;
572    uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
573    texture.map[7] = ubwc_addr;
574    texture.map[8] = ubwc_addr >> 32;
575 
576    texture.map[A6XX_TEX_CONST_DWORDS + 0] =
577       A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
578       A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
579       A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
580       A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
581       A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
582       0x60000; /* XXX used by blob, doesn't seem necessary */
583    texture.map[A6XX_TEX_CONST_DWORDS + 1] =
584       0x1 | /* XXX used by blob, doesn't seem necessary */
585       A6XX_TEX_SAMP_1_UNNORM_COORDS |
586       A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
587    texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
588    texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
589 
590    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
591    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
592                CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
593                CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
594                CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
595                CP_LOAD_STATE6_0_NUM_UNIT(1));
596    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
597 
598    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
599    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
600 
601    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
602    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
603       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
604       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
605       CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
606       CP_LOAD_STATE6_0_NUM_UNIT(1));
607    tu_cs_emit_qw(cs, texture.iova);
608 
609    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
610    tu_cs_emit_qw(cs, texture.iova);
611 
612    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
613 }
614 
615 static void
r3d_src(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer,VkFilter filter)616 r3d_src(struct tu_cmd_buffer *cmd,
617         struct tu_cs *cs,
618         const struct tu_image_view *iview,
619         uint32_t layer,
620         VkFilter filter)
621 {
622    r3d_src_common(cmd, cs, iview->descriptor,
623                   iview->layer_size * layer,
624                   iview->ubwc_layer_size * layer,
625                   filter);
626 }
627 
628 static void
r3d_src_buffer(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat vk_format,uint64_t va,uint32_t pitch,uint32_t width,uint32_t height)629 r3d_src_buffer(struct tu_cmd_buffer *cmd,
630                struct tu_cs *cs,
631                VkFormat vk_format,
632                uint64_t va, uint32_t pitch,
633                uint32_t width, uint32_t height)
634 {
635    uint32_t desc[A6XX_TEX_CONST_DWORDS];
636 
637    struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
638 
639    desc[0] =
640       COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
641       A6XX_TEX_CONST_0_FMT(format.fmt) |
642       A6XX_TEX_CONST_0_SWAP(format.swap) |
643       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
644       // XXX to swizzle into .w for stencil buffer_to_image
645       A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
646       A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
647       A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
648    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
649    desc[2] =
650       A6XX_TEX_CONST_2_PITCH(pitch) |
651       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
652    desc[3] = 0;
653    desc[4] = va;
654    desc[5] = va >> 32;
655    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
656       desc[i] = 0;
657 
658    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
659 }
660 
661 static void
r3d_dst(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)662 r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
663 {
664    tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */
665 
666    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
667    tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
668    tu_cs_image_ref(cs, iview, layer);
669    tu_cs_emit(cs, 0);
670 
671    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
672    tu_cs_image_flag_ref(cs, iview, layer);
673 
674    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
675 }
676 
677 static void
r3d_dst_stencil(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)678 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
679 {
680    tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */
681 
682    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
683    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
684    tu_cs_image_stencil_ref(cs, iview, layer);
685    tu_cs_emit(cs, 0);
686 
687    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
688 }
689 
690 static void
r3d_dst_buffer(struct tu_cs * cs,VkFormat vk_format,uint64_t va,uint32_t pitch)691 r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
692 {
693    struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
694 
695    tu6_emit_msaa(cs, 1); /* TODO: move to setup */
696 
697    tu_cs_emit_regs(cs,
698                    A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
699                    A6XX_RB_MRT_PITCH(0, pitch),
700                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
701                    A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
702                    A6XX_RB_MRT_BASE_HI(0, va >> 32),
703                    A6XX_RB_MRT_BASE_GMEM(0, 0));
704 
705    tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
706 }
707 
708 static uint8_t
aspect_write_mask(VkFormat vk_format,VkImageAspectFlags aspect_mask)709 aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
710 {
711    uint8_t mask = 0xf;
712    assert(aspect_mask);
713    /* note: the only format with partial writing is D24S8,
714     * clear/blit uses the _AS_R8G8B8A8 format to access it
715     */
716    if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
717       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
718          mask = 0x7;
719       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
720          mask = 0x8;
721    }
722    return mask;
723 }
724 
/* Set up a shader-path (r3d) clear or blit to a single render target.
 *
 * vk_format:   destination format; D24 formats with `ubwc` set are
 *              rendered through FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
 * aspect_mask: selects the component write mask via aspect_write_mask
 * rotation:    not referenced by this function
 * clear:       true selects the clear FS with one render target, false
 *              selects the blit FS
 *
 * If predication is active it is switched off locally at the end of this
 * function; r3d_teardown switches it back on.
 */
static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear,
          bool ubwc)
{
   enum a6xx_format format = tu6_base_format(vk_format);

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* the CCU flush and window scissor are only emitted when no render
    * pass is set on the command buffer state
    */
   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   /* blit FS unless clearing; a clear uses the one-render-target clear FS */
   r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);

   /* FS output: depth and sample-mask regids are 0xfc, one MRT, which is
    * fed from regid 0
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   /* blend control with defaults, except for a full sample mask */
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   /* depth and stencil registers are all written with their defaults */
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   /* all four components of render target 0 */
   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                        .color_format = format,
                        .color_sint = vk_format_is_sint(vk_format),
                        .color_uint = vk_format_is_uint(vk_format)));

   /* restrict the written components for partial D24S8 writes */
   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));

   /* LRZ control is written as 0 */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   /* CP_DRAW_PRED_ENABLE_LOCAL is written with 0 here and with 1 in
    * r3d_teardown
    */
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}
796 
797 static void
r3d_run(struct tu_cmd_buffer * cmd,struct tu_cs * cs)798 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
799 {
800    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
801    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
802                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
803                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
804    tu_cs_emit(cs, 1); /* instance count */
805    tu_cs_emit(cs, 2); /* vertex count */
806 }
807 
808 static void
r3d_teardown(struct tu_cmd_buffer * cmd,struct tu_cs * cs)809 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
810 {
811    if (cmd->state.predication_active) {
812       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
813       tu_cs_emit(cs, 1);
814    }
815 }
816 
817 /* blit ops - common interface for 2d/shader paths */
818 
819 struct blit_ops {
820    void (*coords)(struct tu_cs *cs,
821                   const VkOffset2D *dst,
822                   const VkOffset2D *src,
823                   const VkExtent2D *extent);
824    void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
825    void (*src)(
826         struct tu_cmd_buffer *cmd,
827         struct tu_cs *cs,
828         const struct tu_image_view *iview,
829         uint32_t layer,
830         VkFilter filter);
831    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
832                       VkFormat vk_format,
833                       uint64_t va, uint32_t pitch,
834                       uint32_t width, uint32_t height);
835    void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
836    void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
837    void (*setup)(struct tu_cmd_buffer *cmd,
838                  struct tu_cs *cs,
839                  VkFormat vk_format,
840                  VkImageAspectFlags aspect_mask,
841                  enum a6xx_rotation rotation,
842                  bool clear,
843                  bool ubwc);
844    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
845    void (*teardown)(struct tu_cmd_buffer *cmd,
846                     struct tu_cs *cs);
847 };
848 
849 static const struct blit_ops r2d_ops = {
850    .coords = r2d_coords,
851    .clear_value = r2d_clear_value,
852    .src = r2d_src,
853    .src_buffer = r2d_src_buffer,
854    .dst = r2d_dst,
855    .dst_buffer = r2d_dst_buffer,
856    .setup = r2d_setup,
857    .run = r2d_run,
858    .teardown = r2d_teardown,
859 };
860 
861 static const struct blit_ops r3d_ops = {
862    .coords = r3d_coords,
863    .clear_value = r3d_clear_value,
864    .src = r3d_src,
865    .src_buffer = r3d_src_buffer,
866    .dst = r3d_dst,
867    .dst_buffer = r3d_dst_buffer,
868    .setup = r3d_setup,
869    .run = r3d_run,
870    .teardown = r3d_teardown,
871 };
872 
873 /* passthrough set coords from 3D extents */
874 static void
coords(const struct blit_ops * ops,struct tu_cs * cs,const VkOffset3D * dst,const VkOffset3D * src,const VkExtent3D * extent)875 coords(const struct blit_ops *ops,
876        struct tu_cs *cs,
877        const VkOffset3D *dst,
878        const VkOffset3D *src,
879        const VkExtent3D *extent)
880 {
881    ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
882 }
883 
884 static VkFormat
copy_format(VkFormat format,VkImageAspectFlags aspect_mask,bool copy_buffer)885 copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
886 {
887    if (vk_format_is_compressed(format)) {
888       switch (vk_format_get_blocksize(format)) {
889       case 1: return VK_FORMAT_R8_UINT;
890       case 2: return VK_FORMAT_R16_UINT;
891       case 4: return VK_FORMAT_R32_UINT;
892       case 8: return VK_FORMAT_R32G32_UINT;
893       case 16:return VK_FORMAT_R32G32B32A32_UINT;
894       default:
895          unreachable("unhandled format size");
896       }
897    }
898 
899    switch (format) {
900    case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
901       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
902          return VK_FORMAT_R8G8_UNORM;
903       /* fallthrough */
904    case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
905       return VK_FORMAT_R8_UNORM;
906    case VK_FORMAT_D24_UNORM_S8_UINT:
907       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
908          return VK_FORMAT_R8_UNORM;
909       /* fallthrough */
910    default:
911       return format;
912    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
913       return VK_FORMAT_R32_UINT;
914    case VK_FORMAT_D32_SFLOAT_S8_UINT:
915       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
916          return VK_FORMAT_S8_UINT;
917       assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
918       return VK_FORMAT_D32_SFLOAT;
919    }
920 }
921 
922 void
tu6_clear_lrz(struct tu_cmd_buffer * cmd,struct tu_cs * cs,struct tu_image * image,const VkClearValue * value)923 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
924               struct tu_cs *cs,
925               struct tu_image *image,
926               const VkClearValue *value)
927 {
928    const struct blit_ops *ops = &r2d_ops;
929 
930    ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, ROTATE_0, true, false);
931    ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
932    ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
933                    image->bo->iova + image->bo_offset + image->lrz_offset,
934                    image->lrz_pitch * 2);
935    ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
936    ops->run(cmd, cs);
937    ops->teardown(cmd, cs);
938 }
939 
940 static void
tu_image_view_copy_blit(struct tu_image_view * iview,struct tu_image * image,VkFormat format,const VkImageSubresourceLayers * subres,uint32_t layer,bool stencil_read)941 tu_image_view_copy_blit(struct tu_image_view *iview,
942                         struct tu_image *image,
943                         VkFormat format,
944                         const VkImageSubresourceLayers *subres,
945                         uint32_t layer,
946                         bool stencil_read)
947 {
948    VkImageAspectFlags aspect_mask = subres->aspectMask;
949 
950    /* always use the AS_R8G8B8A8 format for these */
951    if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
952        format == VK_FORMAT_X8_D24_UNORM_PACK32) {
953       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
954    }
955 
956    tu_image_view_init(iview, &(VkImageViewCreateInfo) {
957       .image = tu_image_to_handle(image),
958       .viewType = VK_IMAGE_VIEW_TYPE_2D,
959       .format = format,
960       /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
961       .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
962       .subresourceRange = {
963          .aspectMask = aspect_mask,
964          .baseMipLevel = subres->mipLevel,
965          .levelCount = 1,
966          .baseArrayLayer = subres->baseArrayLayer + layer,
967          .layerCount = 1,
968       },
969    }, false);
970 }
971 
972 static void
tu_image_view_copy(struct tu_image_view * iview,struct tu_image * image,VkFormat format,const VkImageSubresourceLayers * subres,uint32_t layer,bool stencil_read)973 tu_image_view_copy(struct tu_image_view *iview,
974                    struct tu_image *image,
975                    VkFormat format,
976                    const VkImageSubresourceLayers *subres,
977                    uint32_t layer,
978                    bool stencil_read)
979 {
980    format = copy_format(format, subres->aspectMask, false);
981    tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
982 }
983 
984 static void
tu_image_view_blit(struct tu_image_view * iview,struct tu_image * image,const VkImageSubresourceLayers * subres,uint32_t layer)985 tu_image_view_blit(struct tu_image_view *iview,
986                    struct tu_image *image,
987                    const VkImageSubresourceLayers *subres,
988                    uint32_t layer)
989 {
990    tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
991 }
992 
993 static void
tu6_blit_image(struct tu_cmd_buffer * cmd,struct tu_image * src_image,struct tu_image * dst_image,const VkImageBlit * info,VkFilter filter)994 tu6_blit_image(struct tu_cmd_buffer *cmd,
995                struct tu_image *src_image,
996                struct tu_image *dst_image,
997                const VkImageBlit *info,
998                VkFilter filter)
999 {
1000    const struct blit_ops *ops = &r2d_ops;
1001    struct tu_cs *cs = &cmd->cs;
1002    uint32_t layers;
1003 
1004    /* 2D blit can't do rotation mirroring from just coordinates */
1005    static const enum a6xx_rotation rotate[2][2] = {
1006       {ROTATE_0, ROTATE_HFLIP},
1007       {ROTATE_VFLIP, ROTATE_180},
1008    };
1009 
1010    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1011                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
1012    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1013                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
1014    bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
1015                    (info->dstOffsets[1].z < info->dstOffsets[0].z);
1016 
1017    if (mirror_z) {
1018       tu_finishme("blit z mirror\n");
1019       return;
1020    }
1021 
1022    if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
1023        info->dstOffsets[1].z - info->dstOffsets[0].z) {
1024       tu_finishme("blit z filter\n");
1025       return;
1026    }
1027 
1028    layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
1029    if (info->dstSubresource.layerCount > 1) {
1030       assert(layers <= 1);
1031       layers = info->dstSubresource.layerCount;
1032    }
1033 
1034    /* BC1_RGB_* formats need to have their last components overriden with 1
1035     * when sampling, which is normally handled with the texture descriptor
1036     * swizzle. The 2d path can't handle that, so use the 3d path.
1037     *
1038     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1039     * the 2d path.
1040     */
1041 
1042    if (dst_image->layout[0].nr_samples > 1 ||
1043        src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1044        src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1045        filter == VK_FILTER_CUBIC_EXT)
1046       ops = &r3d_ops;
1047 
1048    /* use the right format in setup() for D32_S8
1049     * TODO: this probably should use a helper
1050     */
1051    VkFormat format = dst_image->vk_format;
1052    if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1053       if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
1054          format = VK_FORMAT_D32_SFLOAT;
1055       else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
1056          format = VK_FORMAT_S8_UINT;
1057       else
1058          unreachable("unexpected D32_S8 aspect mask in blit_image");
1059    }
1060 
1061    ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1062               rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
1063 
1064    if (ops == &r3d_ops) {
1065       r3d_coords_raw(cs, (float[]) {
1066          info->dstOffsets[0].x, info->dstOffsets[0].y,
1067          info->srcOffsets[0].x, info->srcOffsets[0].y,
1068          info->dstOffsets[1].x, info->dstOffsets[1].y,
1069          info->srcOffsets[1].x, info->srcOffsets[1].y
1070       });
1071    } else {
1072       tu_cs_emit_regs(cs,
1073          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1074                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1075          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1076                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1077       tu_cs_emit_regs(cs,
1078          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1079          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1080          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1081          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1082    }
1083 
1084    struct tu_image_view dst, src;
1085    tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
1086    tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1087 
1088    for (uint32_t i = 0; i < layers; i++) {
1089       ops->dst(cs, &dst, i);
1090       ops->src(cmd, cs, &src, i, filter);
1091       ops->run(cmd, cs);
1092    }
1093 
1094    ops->teardown(cmd, cs);
1095 }
1096 
1097 void
tu_CmdBlitImage(VkCommandBuffer commandBuffer,VkImage srcImage,VkImageLayout srcImageLayout,VkImage dstImage,VkImageLayout dstImageLayout,uint32_t regionCount,const VkImageBlit * pRegions,VkFilter filter)1098 tu_CmdBlitImage(VkCommandBuffer commandBuffer,
1099                 VkImage srcImage,
1100                 VkImageLayout srcImageLayout,
1101                 VkImage dstImage,
1102                 VkImageLayout dstImageLayout,
1103                 uint32_t regionCount,
1104                 const VkImageBlit *pRegions,
1105                 VkFilter filter)
1106 
1107 {
1108    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1109    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1110    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1111 
1112    for (uint32_t i = 0; i < regionCount; ++i) {
1113       /* can't blit both depth and stencil at once with D32_S8
1114        * TODO: more advanced 3D blit path to support it instead?
1115        */
1116       if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1117           dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1118          VkImageBlit region = pRegions[i];
1119          uint32_t b;
1120          for_each_bit(b, pRegions[i].dstSubresource.aspectMask) {
1121             region.srcSubresource.aspectMask = BIT(b);
1122             region.dstSubresource.aspectMask = BIT(b);
1123             tu6_blit_image(cmd, src_image, dst_image, &region, filter);
1124          }
1125          continue;
1126       }
1127       tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
1128    }
1129 }
1130 
1131 static void
copy_compressed(VkFormat format,VkOffset3D * offset,VkExtent3D * extent,uint32_t * width,uint32_t * height)1132 copy_compressed(VkFormat format,
1133                 VkOffset3D *offset,
1134                 VkExtent3D *extent,
1135                 uint32_t *width,
1136                 uint32_t *height)
1137 {
1138    if (!vk_format_is_compressed(format))
1139       return;
1140 
1141    uint32_t block_width = vk_format_get_blockwidth(format);
1142    uint32_t block_height = vk_format_get_blockheight(format);
1143 
1144    offset->x /= block_width;
1145    offset->y /= block_height;
1146 
1147    if (extent) {
1148       extent->width = DIV_ROUND_UP(extent->width, block_width);
1149       extent->height = DIV_ROUND_UP(extent->height, block_height);
1150    }
1151    if (width)
1152       *width = DIV_ROUND_UP(*width, block_width);
1153    if (height)
1154       *height = DIV_ROUND_UP(*height, block_height);
1155 }
1156 
1157 static void
tu_copy_buffer_to_image(struct tu_cmd_buffer * cmd,struct tu_buffer * src_buffer,struct tu_image * dst_image,const VkBufferImageCopy * info)1158 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1159                         struct tu_buffer *src_buffer,
1160                         struct tu_image *dst_image,
1161                         const VkBufferImageCopy *info)
1162 {
1163    struct tu_cs *cs = &cmd->cs;
1164    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1165    VkFormat src_format =
1166       copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1167    const struct blit_ops *ops = &r2d_ops;
1168 
1169    /* special case for buffer to stencil */
1170    if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1171        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1172       ops = &r3d_ops;
1173    }
1174 
1175    /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1176     * which matters for UBWC. buffer_to_image/etc can fail because of this
1177     */
1178 
1179    VkOffset3D offset = info->imageOffset;
1180    VkExtent3D extent = info->imageExtent;
1181    uint32_t src_width = info->bufferRowLength ?: extent.width;
1182    uint32_t src_height = info->bufferImageHeight ?: extent.height;
1183 
1184    copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1185 
1186    uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1187    uint32_t layer_size = src_height * pitch;
1188 
1189    ops->setup(cmd, cs,
1190               copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1191               info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
1192 
1193    struct tu_image_view dst;
1194    tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1195 
1196    for (uint32_t i = 0; i < layers; i++) {
1197       ops->dst(cs, &dst, i);
1198 
1199       uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
1200       if ((src_va & 63) || (pitch & 63)) {
1201          for (uint32_t y = 0; y < extent.height; y++) {
1202             uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1203             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1204                             x + extent.width, 1);
1205             ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y},  &(VkOffset2D){x},
1206                         &(VkExtent2D) {extent.width, 1});
1207             ops->run(cmd, cs);
1208             src_va += pitch;
1209          }
1210       } else {
1211          ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1212          coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1213          ops->run(cmd, cs);
1214       }
1215    }
1216 
1217    ops->teardown(cmd, cs);
1218 }
1219 
1220 void
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,VkBuffer srcBuffer,VkImage dstImage,VkImageLayout dstImageLayout,uint32_t regionCount,const VkBufferImageCopy * pRegions)1221 tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1222                         VkBuffer srcBuffer,
1223                         VkImage dstImage,
1224                         VkImageLayout dstImageLayout,
1225                         uint32_t regionCount,
1226                         const VkBufferImageCopy *pRegions)
1227 {
1228    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1229    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1230    TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1231 
1232    for (unsigned i = 0; i < regionCount; ++i)
1233       tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1234 }
1235 
1236 static void
tu_copy_image_to_buffer(struct tu_cmd_buffer * cmd,struct tu_image * src_image,struct tu_buffer * dst_buffer,const VkBufferImageCopy * info)1237 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1238                         struct tu_image *src_image,
1239                         struct tu_buffer *dst_buffer,
1240                         const VkBufferImageCopy *info)
1241 {
1242    struct tu_cs *cs = &cmd->cs;
1243    uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1244    VkFormat dst_format =
1245       copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1246    bool stencil_read = false;
1247 
1248    if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1249        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1250       stencil_read = true;
1251    }
1252 
1253    const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1254    VkOffset3D offset = info->imageOffset;
1255    VkExtent3D extent = info->imageExtent;
1256    uint32_t dst_width = info->bufferRowLength ?: extent.width;
1257    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1258 
1259    copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1260 
1261    uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1262    uint32_t layer_size = pitch * dst_height;
1263 
1264    ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1265 
1266    struct tu_image_view src;
1267    tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1268 
1269    for (uint32_t i = 0; i < layers; i++) {
1270       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1271 
1272       uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
1273       if ((dst_va & 63) || (pitch & 63)) {
1274          for (uint32_t y = 0; y < extent.height; y++) {
1275             uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1276             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1277             ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1278                         &(VkExtent2D) {extent.width, 1});
1279             ops->run(cmd, cs);
1280             dst_va += pitch;
1281          }
1282       } else {
1283          ops->dst_buffer(cs, dst_format, dst_va, pitch);
1284          coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1285          ops->run(cmd, cs);
1286       }
1287    }
1288 
1289    ops->teardown(cmd, cs);
1290 }
1291 
1292 void
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,VkImage srcImage,VkImageLayout srcImageLayout,VkBuffer dstBuffer,uint32_t regionCount,const VkBufferImageCopy * pRegions)1293 tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1294                         VkImage srcImage,
1295                         VkImageLayout srcImageLayout,
1296                         VkBuffer dstBuffer,
1297                         uint32_t regionCount,
1298                         const VkBufferImageCopy *pRegions)
1299 {
1300    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1301    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1302    TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1303 
1304    for (unsigned i = 0; i < regionCount; ++i)
1305       tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1306 }
1307 
1308 /* Tiled formats don't support swapping, which means that we can't support
1309  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1310  * formats like B5G5R5A1 have a separate linear-only format when sampling.
1311  * Currently we fake support for tiled swapped formats and use the unswapped
1312  * format instead, but this means that reinterpreting copies to and from
1313  * swapped formats can't be performed correctly unless we can swizzle the
1314  * components by reinterpreting the other image as the "correct" swapped
1315  * format, i.e. only when the other image is linear.
1316  */
1317 
1318 static bool
is_swapped_format(VkFormat format)1319 is_swapped_format(VkFormat format)
1320 {
1321    struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1322    struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1323    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1324 }
1325 
1326 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1327  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1328  * versa). This should mirror the logic in fdl6_layout.
1329  */
1330 static bool
image_is_r8g8(struct tu_image * image)1331 image_is_r8g8(struct tu_image *image)
1332 {
1333    return image->layout[0].cpp == 2 &&
1334       vk_format_get_nr_components(image->vk_format) == 2;
1335 }
1336 
1337 static void
tu_copy_image_to_image(struct tu_cmd_buffer * cmd,struct tu_image * src_image,struct tu_image * dst_image,const VkImageCopy * info)1338 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1339                        struct tu_image *src_image,
1340                        struct tu_image *dst_image,
1341                        const VkImageCopy *info)
1342 {
1343    const struct blit_ops *ops = &r2d_ops;
1344    struct tu_cs *cs = &cmd->cs;
1345 
1346    if (dst_image->layout[0].nr_samples > 1)
1347       ops = &r3d_ops;
1348 
1349    VkFormat format = VK_FORMAT_UNDEFINED;
1350    VkOffset3D src_offset = info->srcOffset;
1351    VkOffset3D dst_offset = info->dstOffset;
1352    VkExtent3D extent = info->extent;
1353 
1354    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1355     * Images":
1356     *
1357     *    When copying between compressed and uncompressed formats the extent
1358     *    members represent the texel dimensions of the source image and not
1359     *    the destination. When copying from a compressed image to an
1360     *    uncompressed image the image texel dimensions written to the
1361     *    uncompressed image will be source extent divided by the compressed
1362     *    texel block dimensions. When copying from an uncompressed image to a
1363     *    compressed image the image texel dimensions written to the compressed
1364     *    image will be the source extent multiplied by the compressed texel
1365     *    block dimensions.
1366     *
1367     * This means we only have to adjust the extent if the source image is
1368     * compressed.
1369     */
1370    copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1371    copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1372 
1373    VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1374    VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1375 
1376    bool use_staging_blit = false;
1377 
1378    if (src_format == dst_format) {
1379       /* Images that share a format can always be copied directly because it's
1380        * the same as a blit.
1381        */
1382       format = src_format;
1383    } else if (!src_image->layout[0].tile_mode) {
1384       /* If an image is linear, we can always safely reinterpret it with the
1385        * other image's format and then do a regular blit.
1386        */
1387       format = dst_format;
1388    } else if (!dst_image->layout[0].tile_mode) {
1389       format = src_format;
1390    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1391       /* We can't currently copy r8g8 images to/from other cpp=2 images,
1392        * due to the different tile layout.
1393        */
1394       use_staging_blit = true;
1395    } else if (is_swapped_format(src_format) ||
1396               is_swapped_format(dst_format)) {
1397       /* If either format has a non-identity swap, then we can't copy
1398        * to/from it.
1399        */
1400       use_staging_blit = true;
1401    } else if (!src_image->layout[0].ubwc) {
1402       format = dst_format;
1403    } else if (!dst_image->layout[0].ubwc) {
1404       format = src_format;
1405    } else {
1406       /* Both formats use UBWC and so neither can be reinterpreted.
1407        * TODO: We could do an in-place decompression of the dst instead.
1408        */
1409       use_staging_blit = true;
1410    }
1411 
1412    struct tu_image_view dst, src;
1413 
1414    if (use_staging_blit) {
1415       tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1416       tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1417 
1418       struct tu_image staging_image = {
1419          .vk_format = src_format,
1420          .level_count = 1,
1421          .layer_count = info->srcSubresource.layerCount,
1422          .bo_offset = 0,
1423       };
1424 
1425       VkImageSubresourceLayers staging_subresource = {
1426          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1427          .mipLevel = 0,
1428          .baseArrayLayer = 0,
1429          .layerCount = info->srcSubresource.layerCount,
1430       };
1431 
1432       VkOffset3D staging_offset = { 0 };
1433 
1434       staging_image.layout[0].tile_mode = TILE6_LINEAR;
1435       staging_image.layout[0].ubwc = false;
1436 
1437       fdl6_layout(&staging_image.layout[0],
1438                   vk_format_to_pipe_format(staging_image.vk_format),
1439                   src_image->layout[0].nr_samples,
1440                   extent.width,
1441                   extent.height,
1442                   extent.depth,
1443                   staging_image.level_count,
1444                   staging_image.layer_count,
1445                   extent.depth > 1,
1446                   NULL);
1447 
1448       VkResult result = tu_get_scratch_bo(cmd->device,
1449                                           staging_image.layout[0].size,
1450                                           &staging_image.bo);
1451       if (result != VK_SUCCESS) {
1452          cmd->record_result = result;
1453          return;
1454       }
1455 
1456       struct tu_image_view staging;
1457       tu_image_view_copy(&staging, &staging_image, src_format,
1458                          &staging_subresource, 0, false);
1459 
1460       ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1461       coords(ops, cs, &staging_offset, &src_offset, &extent);
1462 
1463       for (uint32_t i = 0; i < info->extent.depth; i++) {
1464          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1465          ops->dst(cs, &staging, i);
1466          ops->run(cmd, cs);
1467       }
1468 
1469       /* When executed by the user there has to be a pipeline barrier here,
1470        * but since we're doing it manually we'll have to flush ourselves.
1471        */
1472       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1473       tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1474 
1475       tu_image_view_copy(&staging, &staging_image, dst_format,
1476                          &staging_subresource, 0, false);
1477 
1478       ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1479                  ROTATE_0, false, dst_image->layout[0].ubwc);
1480       coords(ops, cs, &dst_offset, &staging_offset, &extent);
1481 
1482       for (uint32_t i = 0; i < info->extent.depth; i++) {
1483          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1484          ops->dst(cs, &dst, i);
1485          ops->run(cmd, cs);
1486       }
1487    } else {
1488       tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1489       tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1490 
1491       ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1492                  ROTATE_0, false, dst_image->layout[0].ubwc);
1493       coords(ops, cs, &dst_offset, &src_offset, &extent);
1494 
1495       for (uint32_t i = 0; i < info->extent.depth; i++) {
1496          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1497          ops->dst(cs, &dst, i);
1498          ops->run(cmd, cs);
1499       }
1500    }
1501 
1502    ops->teardown(cmd, cs);
1503 }
1504 
1505 void
tu_CmdCopyImage(VkCommandBuffer commandBuffer,VkImage srcImage,VkImageLayout srcImageLayout,VkImage destImage,VkImageLayout destImageLayout,uint32_t regionCount,const VkImageCopy * pRegions)1506 tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1507                 VkImage srcImage,
1508                 VkImageLayout srcImageLayout,
1509                 VkImage destImage,
1510                 VkImageLayout destImageLayout,
1511                 uint32_t regionCount,
1512                 const VkImageCopy *pRegions)
1513 {
1514    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1515    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1516    TU_FROM_HANDLE(tu_image, dst_image, destImage);
1517 
1518    for (uint32_t i = 0; i < regionCount; ++i)
1519       tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1520 }
1521 
1522 static void
copy_buffer(struct tu_cmd_buffer * cmd,uint64_t dst_va,uint64_t src_va,uint64_t size,uint32_t block_size)1523 copy_buffer(struct tu_cmd_buffer *cmd,
1524             uint64_t dst_va,
1525             uint64_t src_va,
1526             uint64_t size,
1527             uint32_t block_size)
1528 {
1529    const struct blit_ops *ops = &r2d_ops;
1530    struct tu_cs *cs = &cmd->cs;
1531    VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1532    uint64_t blocks = size / block_size;
1533 
1534    ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
1535 
1536    while (blocks) {
1537       uint32_t src_x = (src_va & 63) / block_size;
1538       uint32_t dst_x = (dst_va & 63) / block_size;
1539       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1540 
1541       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1542       ops->dst_buffer(     cs, format, dst_va & ~63, 0);
1543       ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1544       ops->run(cmd, cs);
1545 
1546       src_va += width * block_size;
1547       dst_va += width * block_size;
1548       blocks -= width;
1549    }
1550 
1551    ops->teardown(cmd, cs);
1552 }
1553 
1554 void
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,VkBuffer srcBuffer,VkBuffer dstBuffer,uint32_t regionCount,const VkBufferCopy * pRegions)1555 tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1556                  VkBuffer srcBuffer,
1557                  VkBuffer dstBuffer,
1558                  uint32_t regionCount,
1559                  const VkBufferCopy *pRegions)
1560 {
1561    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1562    TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1563    TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1564 
1565    for (unsigned i = 0; i < regionCount; ++i) {
1566       copy_buffer(cmd,
1567                   tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1568                   tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1569                   pRegions[i].size, 1);
1570    }
1571 }
1572 
1573 void
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,VkBuffer dstBuffer,VkDeviceSize dstOffset,VkDeviceSize dataSize,const void * pData)1574 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1575                    VkBuffer dstBuffer,
1576                    VkDeviceSize dstOffset,
1577                    VkDeviceSize dataSize,
1578                    const void *pData)
1579 {
1580    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1581    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1582 
1583    struct tu_cs_memory tmp;
1584    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
1585    if (result != VK_SUCCESS) {
1586       cmd->record_result = result;
1587       return;
1588    }
1589 
1590    memcpy(tmp.map, pData, dataSize);
1591    copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1592 }
1593 
1594 void
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,VkBuffer dstBuffer,VkDeviceSize dstOffset,VkDeviceSize fillSize,uint32_t data)1595 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1596                  VkBuffer dstBuffer,
1597                  VkDeviceSize dstOffset,
1598                  VkDeviceSize fillSize,
1599                  uint32_t data)
1600 {
1601    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1602    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1603    const struct blit_ops *ops = &r2d_ops;
1604    struct tu_cs *cs = &cmd->cs;
1605 
1606    if (fillSize == VK_WHOLE_SIZE)
1607       fillSize = buffer->size - dstOffset;
1608 
1609    uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1610    uint32_t blocks = fillSize / 4;
1611 
1612    ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
1613    ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1614 
1615    while (blocks) {
1616       uint32_t dst_x = (dst_va & 63) / 4;
1617       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1618 
1619       ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1620       ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1621       ops->run(cmd, cs);
1622 
1623       dst_va += width * 4;
1624       blocks -= width;
1625    }
1626 
1627    ops->teardown(cmd, cs);
1628 }
1629 
1630 void
tu_CmdResolveImage(VkCommandBuffer commandBuffer,VkImage srcImage,VkImageLayout srcImageLayout,VkImage dstImage,VkImageLayout dstImageLayout,uint32_t regionCount,const VkImageResolve * pRegions)1631 tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1632                    VkImage srcImage,
1633                    VkImageLayout srcImageLayout,
1634                    VkImage dstImage,
1635                    VkImageLayout dstImageLayout,
1636                    uint32_t regionCount,
1637                    const VkImageResolve *pRegions)
1638 {
1639    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1640    TU_FROM_HANDLE(tu_image, src_image, srcImage);
1641    TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1642    const struct blit_ops *ops = &r2d_ops;
1643    struct tu_cs *cs = &cmd->cs;
1644 
1645    ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1646               ROTATE_0, false, dst_image->layout[0].ubwc);
1647 
1648    for (uint32_t i = 0; i < regionCount; ++i) {
1649       const VkImageResolve *info = &pRegions[i];
1650       uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
1651 
1652       assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
1653       /* TODO: aspect masks possible ? */
1654 
1655       coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
1656 
1657       struct tu_image_view dst, src;
1658       tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
1659       tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
1660 
1661       for (uint32_t i = 0; i < layers; i++) {
1662          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1663          ops->dst(cs, &dst, i);
1664          ops->run(cmd, cs);
1665       }
1666    }
1667 
1668    ops->teardown(cmd, cs);
1669 }
1670 
/* Loop header that declares `layer` and iterates it over the layers to
 * process.  With a non-zero layer_mask, indices 0..util_logbase2(layer_mask)
 * are walked and the body runs only for those whose bit is set in the mask;
 * with a zero mask the body runs for every index in [0, layers).
 *
 * The arguments are expanded several times, so they must be free of side
 * effects.  They are parenthesized so that expressions such as `a | b` can be
 * passed safely.
 */
#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : (layers)); \
        layer++) \
      if (!(layer_mask) || ((layer_mask) & BIT(layer)))
1676 
1677 void
tu_resolve_sysmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,struct tu_image_view * src,struct tu_image_view * dst,uint32_t layer_mask,uint32_t layers,const VkRect2D * rect)1678 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
1679                   struct tu_cs *cs,
1680                   struct tu_image_view *src,
1681                   struct tu_image_view *dst,
1682                   uint32_t layer_mask,
1683                   uint32_t layers,
1684                   const VkRect2D *rect)
1685 {
1686    const struct blit_ops *ops = &r2d_ops;
1687 
1688    assert(src->image->vk_format == dst->image->vk_format);
1689 
1690    ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
1691               ROTATE_0, false, dst->ubwc_enabled);
1692    ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
1693 
1694    for_each_layer(i, layer_mask, layers) {
1695       ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
1696       ops->dst(cs, dst, i);
1697       ops->run(cmd, cs);
1698    }
1699 
1700    ops->teardown(cmd, cs);
1701 }
1702 
1703 static void
clear_image(struct tu_cmd_buffer * cmd,struct tu_image * image,const VkClearValue * clear_value,const VkImageSubresourceRange * range,VkImageAspectFlags aspect_mask)1704 clear_image(struct tu_cmd_buffer *cmd,
1705             struct tu_image *image,
1706             const VkClearValue *clear_value,
1707             const VkImageSubresourceRange *range,
1708             VkImageAspectFlags aspect_mask)
1709 {
1710    uint32_t level_count = tu_get_levelCount(image, range);
1711    uint32_t layer_count = tu_get_layerCount(image, range);
1712    struct tu_cs *cs = &cmd->cs;
1713    VkFormat format = image->vk_format;
1714    if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1715       format = copy_format(format, aspect_mask, false);
1716 
1717    if (image->layout[0].depth0 > 1) {
1718       assert(layer_count == 1);
1719       assert(range->baseArrayLayer == 0);
1720    }
1721 
1722    const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
1723 
1724    ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
1725    if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1726       ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
1727    else
1728       ops->clear_value(cs, format, clear_value);
1729 
1730    for (unsigned j = 0; j < level_count; j++) {
1731       if (image->layout[0].depth0 > 1)
1732          layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
1733 
1734       ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
1735                      u_minify(image->layout[0].width0, range->baseMipLevel + j),
1736                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
1737                   });
1738 
1739       struct tu_image_view dst;
1740       tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
1741          .aspectMask = aspect_mask,
1742          .mipLevel = range->baseMipLevel + j,
1743          .baseArrayLayer = range->baseArrayLayer,
1744          .layerCount = 1,
1745       }, 0, false);
1746 
1747       for (uint32_t i = 0; i < layer_count; i++) {
1748          ops->dst(cs, &dst, i);
1749          ops->run(cmd, cs);
1750       }
1751    }
1752 
1753    ops->teardown(cmd, cs);
1754 }
1755 
1756 void
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,VkImage image_h,VkImageLayout imageLayout,const VkClearColorValue * pColor,uint32_t rangeCount,const VkImageSubresourceRange * pRanges)1757 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
1758                       VkImage image_h,
1759                       VkImageLayout imageLayout,
1760                       const VkClearColorValue *pColor,
1761                       uint32_t rangeCount,
1762                       const VkImageSubresourceRange *pRanges)
1763 {
1764    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1765    TU_FROM_HANDLE(tu_image, image, image_h);
1766 
1767    for (unsigned i = 0; i < rangeCount; i++)
1768       clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
1769 }
1770 
1771 void
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,VkImage image_h,VkImageLayout imageLayout,const VkClearDepthStencilValue * pDepthStencil,uint32_t rangeCount,const VkImageSubresourceRange * pRanges)1772 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1773                              VkImage image_h,
1774                              VkImageLayout imageLayout,
1775                              const VkClearDepthStencilValue *pDepthStencil,
1776                              uint32_t rangeCount,
1777                              const VkImageSubresourceRange *pRanges)
1778 {
1779    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1780    TU_FROM_HANDLE(tu_image, image, image_h);
1781 
1782    for (unsigned i = 0; i < rangeCount; i++) {
1783       const VkImageSubresourceRange *range = &pRanges[i];
1784 
1785       if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1786          /* can't clear both depth and stencil at once, split up the aspect mask */
1787          uint32_t b;
1788          for_each_bit(b, range->aspectMask)
1789             clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
1790          continue;
1791       }
1792 
1793       clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
1794    }
1795 }
1796 
1797 static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer * cmd,uint32_t attachment_count,const VkClearAttachment * attachments,uint32_t rect_count,const VkClearRect * rects)1798 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
1799                             uint32_t attachment_count,
1800                             const VkClearAttachment *attachments,
1801                             uint32_t rect_count,
1802                             const VkClearRect *rects)
1803 {
1804    /* the shader path here is special, it avoids changing MRT/etc state */
1805    const struct tu_render_pass *pass = cmd->state.pass;
1806    const struct tu_subpass *subpass = cmd->state.subpass;
1807    const uint32_t mrt_count = subpass->color_count;
1808    struct tu_cs *cs = &cmd->draw_cs;
1809    uint32_t clear_value[MAX_RTS][4];
1810    float z_clear_val = 0.0f;
1811    uint8_t s_clear_val = 0;
1812    uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
1813    bool z_clear = false;
1814    bool s_clear = false;
1815    bool layered_clear = false;
1816    uint32_t max_samples = 1;
1817 
1818    for (uint32_t i = 0; i < attachment_count; i++) {
1819       uint32_t a;
1820       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1821          uint32_t c = attachments[i].colorAttachment;
1822          a = subpass->color_attachments[c].attachment;
1823          if (a == VK_ATTACHMENT_UNUSED)
1824             continue;
1825 
1826          clear_rts |= 1 << c;
1827          clear_components |= 0xf << (c * 4);
1828          memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
1829       } else {
1830          a = subpass->depth_stencil_attachment.attachment;
1831          if (a == VK_ATTACHMENT_UNUSED)
1832             continue;
1833 
1834          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1835             z_clear = true;
1836             z_clear_val = attachments[i].clearValue.depthStencil.depth;
1837          }
1838 
1839          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1840             s_clear = true;
1841             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
1842          }
1843       }
1844 
1845       max_samples = MAX2(max_samples, pass->attachments[a].samples);
1846    }
1847 
1848    /* disable all draw states so they don't interfere
1849     * TODO: use and re-use draw states
1850     * we have to disable draw states individually to preserve
1851     * input attachment states, because a secondary command buffer
1852     * won't be able to restore them
1853     */
1854    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
1855    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
1856       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
1857           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
1858          continue;
1859       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
1860                      CP_SET_DRAW_STATE__0_DISABLE);
1861       tu_cs_emit_qw(cs, 0);
1862    }
1863    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
1864 
1865    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1866    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
1867                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
1868                   0xfc000000);
1869    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1870 
1871    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
1872    for (uint32_t i = 0; i < mrt_count; i++) {
1873       if (clear_rts & (1 << i))
1874          tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
1875       else
1876          tu_cs_emit(cs, 0);
1877    }
1878 
1879    for (uint32_t i = 0; i < rect_count; i++) {
1880       if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
1881          layered_clear = true;
1882    }
1883 
1884    /* a630 doesn't support multiview masks, which means that we can't use the
1885     * normal multiview path without potentially recompiling a shader on-demand
1886     * or using a more complicated variant that takes the mask as a const. Just
1887     * use the layered path instead, since it shouldn't be much worse.
1888     */
1889    if (subpass->multiview_mask) {
1890       layered_clear = true;
1891    }
1892 
1893    r3d_common(cmd, cs, false, num_rts, layered_clear);
1894 
1895    tu_cs_emit_regs(cs,
1896                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
1897    tu_cs_emit_regs(cs,
1898                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
1899 
1900    tu_cs_emit_regs(cs,
1901                    A6XX_RB_FS_OUTPUT_CNTL0(),
1902                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
1903 
1904    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1905    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
1906    for (uint32_t i = 0; i < mrt_count; i++) {
1907       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
1908             .component_enable = COND(clear_rts & (1 << i), 0xf)));
1909    }
1910 
1911    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1912    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1913 
1914    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1915    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
1916          .z_enable = z_clear,
1917          .z_write_enable = z_clear,
1918          .zfunc = FUNC_ALWAYS));
1919    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1920    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
1921          .stencil_enable = s_clear,
1922          .func = FUNC_ALWAYS,
1923          .zpass = STENCIL_REPLACE));
1924    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
1925    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
1926    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
1927 
1928    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
1929    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1930                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1931                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1932                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
1933                   CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
1934    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1935    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1936    for_each_bit(b, clear_rts)
1937       tu_cs_emit_array(cs, clear_value[b], 4);
1938 
1939    for (uint32_t i = 0; i < rect_count; i++) {
1940       /* This should be true because of this valid usage for
1941        * vkCmdClearAttachments:
1942        *
1943        *    "If the render pass instance this is recorded in uses multiview,
1944        *    then baseArrayLayer must be zero and layerCount must be one"
1945        */
1946       assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
1947 
1948       for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
1949          r3d_coords_raw(cs, (float[]) {
1950             rects[i].rect.offset.x, rects[i].rect.offset.y,
1951             z_clear_val, uif(rects[i].baseArrayLayer + layer),
1952             rects[i].rect.offset.x + rects[i].rect.extent.width,
1953             rects[i].rect.offset.y + rects[i].rect.extent.height,
1954             z_clear_val, 1.0f,
1955          });
1956          r3d_run(cmd, cs);
1957       }
1958    }
1959 }
1960 
1961 static void
pack_gmem_clear_value(const VkClearValue * val,VkFormat format,uint32_t clear_value[4])1962 pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
1963 {
1964    switch (format) {
1965    case VK_FORMAT_X8_D24_UNORM_PACK32:
1966    case VK_FORMAT_D24_UNORM_S8_UINT:
1967       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1968                        val->depthStencil.stencil << 24;
1969       return;
1970    case VK_FORMAT_D16_UNORM:
1971       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1972       return;
1973    case VK_FORMAT_D32_SFLOAT:
1974       clear_value[0] = fui(val->depthStencil.depth);
1975       return;
1976    case VK_FORMAT_S8_UINT:
1977       clear_value[0] = val->depthStencil.stencil;
1978       return;
1979    default:
1980       break;
1981    }
1982 
1983    float tmp[4];
1984    memcpy(tmp, val->color.float32, 4 * sizeof(float));
1985    if (vk_format_is_srgb(format)) {
1986       for (int i = 0; i < 3; i++)
1987          tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1988    }
1989 
1990 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1991    ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1992    switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1993    case 4:
1994       PACK_F(r4g4b4a4_unorm);
1995       break;
1996    case 5:
1997       if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1998          PACK_F(r5g6b5_unorm);
1999       else
2000          PACK_F(r5g5b5a1_unorm);
2001       break;
2002    case 8:
2003       if (vk_format_is_snorm(format))
2004          PACK_F(r8g8b8a8_snorm);
2005       else if (vk_format_is_unorm(format))
2006          PACK_F(r8g8b8a8_unorm);
2007       else
2008          pack_int8(clear_value, val->color.uint32);
2009       break;
2010    case 10:
2011       if (vk_format_is_int(format))
2012          pack_int10_2(clear_value, val->color.uint32);
2013       else
2014          PACK_F(r10g10b10a2_unorm);
2015       break;
2016    case 11:
2017       clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2018       break;
2019    case 16:
2020       if (vk_format_is_snorm(format))
2021          PACK_F(r16g16b16a16_snorm);
2022       else if (vk_format_is_unorm(format))
2023          PACK_F(r16g16b16a16_unorm);
2024       else if (vk_format_is_float(format))
2025          PACK_F(r16g16b16a16_float);
2026       else
2027          pack_int16(clear_value, val->color.uint32);
2028       break;
2029    case 32:
2030       memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2031       break;
2032    default:
2033       unreachable("unexpected channel size");
2034    }
2035 #undef PACK_F
2036 }
2037 
2038 static void
clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat format,uint8_t clear_mask,uint32_t gmem_offset,const VkClearValue * value)2039 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2040                       struct tu_cs *cs,
2041                       VkFormat format,
2042                       uint8_t clear_mask,
2043                       uint32_t gmem_offset,
2044                       const VkClearValue *value)
2045 {
2046    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2047    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2048 
2049    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2050 
2051    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2052    tu_cs_emit(cs, gmem_offset);
2053 
2054    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2055    tu_cs_emit(cs, 0);
2056 
2057    uint32_t clear_vals[4] = {};
2058    pack_gmem_clear_value(value, format, clear_vals);
2059 
2060    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2061    tu_cs_emit_array(cs, clear_vals, 4);
2062 
2063    tu6_emit_event_write(cmd, cs, BLIT);
2064 }
2065 
2066 static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t attachment,VkImageAspectFlags mask,const VkClearValue * value)2067 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2068                               struct tu_cs *cs,
2069                               uint32_t attachment,
2070                               VkImageAspectFlags mask,
2071                               const VkClearValue *value)
2072 {
2073    const struct tu_render_pass_attachment *att =
2074       &cmd->state.pass->attachments[attachment];
2075 
2076    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2077       if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2078          clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
2079       if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2080          clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
2081       return;
2082    }
2083 
2084    clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
2085 }
2086 
2087 static void
tu_clear_gmem_attachments(struct tu_cmd_buffer * cmd,uint32_t attachment_count,const VkClearAttachment * attachments,uint32_t rect_count,const VkClearRect * rects)2088 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2089                           uint32_t attachment_count,
2090                           const VkClearAttachment *attachments,
2091                           uint32_t rect_count,
2092                           const VkClearRect *rects)
2093 {
2094    const struct tu_subpass *subpass = cmd->state.subpass;
2095    struct tu_cs *cs = &cmd->draw_cs;
2096 
2097    /* TODO: swap the loops for smaller cmdstream */
2098    for (unsigned i = 0; i < rect_count; i++) {
2099       unsigned x1 = rects[i].rect.offset.x;
2100       unsigned y1 = rects[i].rect.offset.y;
2101       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2102       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2103 
2104       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2105       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2106       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2107 
2108       for (unsigned j = 0; j < attachment_count; j++) {
2109          uint32_t a;
2110          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2111             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2112          else
2113             a = subpass->depth_stencil_attachment.attachment;
2114 
2115          if (a == VK_ATTACHMENT_UNUSED)
2116                continue;
2117 
2118          tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2119                                        &attachments[j].clearValue);
2120       }
2121    }
2122 }
2123 
2124 void
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,uint32_t attachmentCount,const VkClearAttachment * pAttachments,uint32_t rectCount,const VkClearRect * pRects)2125 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2126                        uint32_t attachmentCount,
2127                        const VkClearAttachment *pAttachments,
2128                        uint32_t rectCount,
2129                        const VkClearRect *pRects)
2130 {
2131    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2132    struct tu_cs *cs = &cmd->draw_cs;
2133 
2134    /* sysmem path behaves like a draw, note we don't have a way of using different
2135     * flushes for sysmem/gmem, so this needs to be outside of the cond_exec
2136     */
2137    tu_emit_cache_flush_renderpass(cmd, cs);
2138 
2139    for (uint32_t j = 0; j < attachmentCount; j++) {
2140       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2141          continue;
2142       cmd->state.lrz.valid = false;
2143       cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2144    }
2145 
2146    /* vkCmdClearAttachments is supposed to respect the predicate if active.
2147     * The easiest way to do this is to always use the 3d path, which always
2148     * works even with GMEM because it's just a simple draw using the existing
2149     * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2150     * skipped in the binning pass, since otherwise they produce binning data
2151     * which isn't consumed and leads to the wrong binning data being read, so
2152     * condition on GMEM | SYSMEM.
2153     */
2154    if (cmd->state.predication_active) {
2155       tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2156                              CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2157       tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2158       tu_cond_exec_end(cs);
2159       return;
2160    }
2161 
2162    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2163    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2164    tu_cond_exec_end(cs);
2165 
2166    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2167    tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2168    tu_cond_exec_end(cs);
2169 }
2170 
2171 static void
clear_sysmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat format,VkImageAspectFlags clear_mask,const VkRenderPassBeginInfo * info,uint32_t a,bool separate_stencil)2172 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2173                         struct tu_cs *cs,
2174                         VkFormat format,
2175                         VkImageAspectFlags clear_mask,
2176                         const VkRenderPassBeginInfo *info,
2177                         uint32_t a,
2178                         bool separate_stencil)
2179 {
2180    const struct tu_framebuffer *fb = cmd->state.framebuffer;
2181    const struct tu_image_view *iview = fb->attachments[a].attachment;
2182    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2183    const struct blit_ops *ops = &r2d_ops;
2184    if (cmd->state.pass->attachments[a].samples > 1)
2185       ops = &r3d_ops;
2186 
2187    ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
2188    ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2189    ops->clear_value(cs, format, &info->pClearValues[a]);
2190 
2191    for_each_layer(i, clear_views, fb->layers) {
2192       if (separate_stencil) {
2193          if (ops == &r3d_ops)
2194             r3d_dst_stencil(cs, iview, i);
2195          else
2196             r2d_dst_stencil(cs, iview, i);
2197       } else {
2198          ops->dst(cs, iview, i);
2199       }
2200       ops->run(cmd, cs);
2201    }
2202 
2203    ops->teardown(cmd, cs);
2204 }
2205 
2206 void
tu_clear_sysmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a,const VkRenderPassBeginInfo * info)2207 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2208                            struct tu_cs *cs,
2209                            uint32_t a,
2210                            const VkRenderPassBeginInfo *info)
2211 {
2212    const struct tu_render_pass_attachment *attachment =
2213       &cmd->state.pass->attachments[a];
2214 
2215    if (!attachment->clear_mask)
2216       return;
2217 
2218    /* Wait for any flushes at the beginning of the renderpass to complete */
2219    tu_cs_emit_wfi(cs);
2220 
2221    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2222       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2223          clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2224                                  info, a, false);
2225       }
2226       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2227          clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2228                                  info, a, true);
2229       }
2230    } else {
2231       clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2232                               info, a, false);
2233    }
2234 
2235    /* The spec doesn't explicitly say, but presumably the initial renderpass
2236     * clear is considered part of the renderpass, and therefore barriers
2237     * aren't required inside the subpass/renderpass.  Therefore we need to
2238     * flush CCU color into CCU depth here, just like with
2239     * vkCmdClearAttachments(). Note that because this only happens at the
2240     * beginning of a renderpass, and renderpass writes are considered
2241     * "incoherent", we shouldn't have to worry about syncing depth into color
2242     * beforehand as depth should already be flushed.
2243     */
2244    if (vk_format_is_depth_or_stencil(attachment->format)) {
2245       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2246       tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2247    } else {
2248       tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2249       tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2250    }
2251 }
2252 
2253 void
tu_clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a,const VkRenderPassBeginInfo * info)2254 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2255                          struct tu_cs *cs,
2256                          uint32_t a,
2257                          const VkRenderPassBeginInfo *info)
2258 {
2259    const struct tu_render_pass_attachment *attachment =
2260       &cmd->state.pass->attachments[a];
2261 
2262    if (!attachment->clear_mask)
2263       return;
2264 
2265    tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2266 
2267    tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2268                                  &info->pClearValues[a]);
2269 }
2270 
/* Program the RB_BLIT_* registers for one attachment plane and trigger a
 * BLIT event.
 *
 * iview            image view supplying the destination info/address/pitch
 * attachment       render pass attachment supplying sample count, format
 *                  and GMEM offsets
 * resolve          callers in this file pass false from
 *                  tu_load_gmem_attachment() and true from
 *                  tu_store_gmem_attachment(); the RB_BLIT_INFO unk0/gmem
 *                  bits are set to its negation
 * separate_stencil use the view's stencil plane (stencil base address,
 *                  stencil pitch, stencil GMEM offset) instead of the main
 *                  plane
 */
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      /* "integer" bit disables msaa resolve averaging */
      .integer = vk_format_is_int(attachment->format)));

   /* 4-dword register write starting at RB_BLIT_DST_INFO; both branches
    * below begin by supplying its payload.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (separate_stencil) {
      /* The FLAGS bit is masked out of DST_INFO and no flag-buffer
       * registers are written for the stencil plane.
       */
      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
      tu_cs_emit_qw(cs, iview->stencil_base_addr);
      tu_cs_emit(cs, iview->stencil_PITCH);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
   } else {
      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
      /* NOTE(review): tu_cs_image_ref_2d() is defined elsewhere; it
       * presumably emits the remaining three dwords (address + pitch) of
       * the packet opened above - confirm.
       */
      tu_cs_image_ref_2d(cs, iview, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
      tu_cs_image_flag_ref(cs, iview, 0);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
   }

   /* Kick off the blit with the state programmed above. */
   tu6_emit_event_write(cmd, cs, BLIT);
}
2309 
2310 static bool
blit_can_resolve(VkFormat format)2311 blit_can_resolve(VkFormat format)
2312 {
2313    const struct util_format_description *desc = vk_format_description(format);
2314 
2315    /* blit event can only do resolve for simple cases:
2316     * averaging samples as unsigned integers or choosing only one sample
2317     */
2318    if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2319       return false;
2320 
2321    /* can't do formats with larger channel sizes
2322     * note: this includes all float formats
2323     * note2: single channel integer formats seem OK
2324     */
2325    if (desc->channel[0].size > 10)
2326       return false;
2327 
2328    switch (format) {
2329    /* for unknown reasons blit event can't msaa resolve these formats when tiled
2330     * likely related to these formats having different layout from other cpp=2 formats
2331     */
2332    case VK_FORMAT_R8G8_UNORM:
2333    case VK_FORMAT_R8G8_UINT:
2334    case VK_FORMAT_R8G8_SINT:
2335    /* TODO: this one should be able to work? */
2336    case VK_FORMAT_D24_UNORM_S8_UINT:
2337       return false;
2338    default:
2339       break;
2340    }
2341 
2342    return true;
2343 }
2344 
2345 void
tu_load_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a,bool force_load)2346 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2347                         struct tu_cs *cs,
2348                         uint32_t a,
2349                         bool force_load)
2350 {
2351    const struct tu_image_view *iview =
2352       cmd->state.framebuffer->attachments[a].attachment;
2353    const struct tu_render_pass_attachment *attachment =
2354       &cmd->state.pass->attachments[a];
2355 
2356    if (attachment->load || force_load)
2357       tu_emit_blit(cmd, cs, iview, attachment, false, false);
2358 
2359    if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2360       tu_emit_blit(cmd, cs, iview, attachment, false, true);
2361 }
2362 
/* Copy one attachment plane out of GMEM into `iview` with a CP_BLIT
 * (BLIT_OP_SCALE) instead of the BLIT event.
 *
 * samples          sample count of the GMEM source
 * separate_stencil target the view's stencil plane via r2d_dst_stencil()
 * format           format used for both the 2D setup and the source info
 * gmem_offset      offset of the source data from the device's gmem_base
 * cpp              bytes per pixel used to derive the source pitch from the
 *                  width of tile0
 *
 * Destination/source coordinates are not set here; the caller in this file
 * programs them with r2d_coords() beforehand.
 */
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              VkFormat format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
                    iview->ubwc_enabled, true);
   /* Destination is layer 0 of the view (stencil plane if requested). */
   if (separate_stencil)
      r2d_dst_stencil(cs, iview, 0);
   else
      r2d_dst(cs, iview, 0);

   /* Describe the GMEM contents as the 2D source: TILE6_2 tiling, located
    * at gmem_base + gmem_offset, averaged across samples unless the format
    * is an integer format.
    */
   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
                      .tile_mode = TILE6_2,
                      .srgb = vk_format_is_srgb(format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !vk_format_is_int(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_HI(),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}
2410 
2411 void
tu_store_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a,uint32_t gmem_a)2412 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2413                          struct tu_cs *cs,
2414                          uint32_t a,
2415                          uint32_t gmem_a)
2416 {
2417    struct tu_physical_device *phys_dev = cmd->device->physical_device;
2418    const VkRect2D *render_area = &cmd->state.render_area;
2419    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2420    struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
2421    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2422 
2423    if (!dst->store && !dst->store_stencil)
2424       return;
2425 
2426    uint32_t x1 = render_area->offset.x;
2427    uint32_t y1 = render_area->offset.y;
2428    uint32_t x2 = x1 + render_area->extent.width;
2429    uint32_t y2 = y1 + render_area->extent.height;
2430    /* x2/y2 can be unaligned if equal to the size of the image,
2431     * since it will write into padding space
2432     * the one exception is linear levels which don't have the
2433     * required y padding in the layout (except for the last level)
2434     */
2435    bool need_y2_align =
2436       y2 != iview->extent.height || iview->need_y2_align;
2437 
2438    bool unaligned =
2439       x1 % phys_dev->info.gmem_align_w ||
2440       (x2 % phys_dev->info.gmem_align_w && x2 != iview->extent.width) ||
2441       y1 % phys_dev->info.gmem_align_h || (y2 % phys_dev->info.gmem_align_h && need_y2_align);
2442 
2443    /* use fast path when render area is aligned, except for unsupported resolve cases */
2444    if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2445       if (dst->store)
2446          tu_emit_blit(cmd, cs, iview, src, true, false);
2447       if (dst->store_stencil)
2448          tu_emit_blit(cmd, cs, iview, src, true, true);
2449       return;
2450    }
2451 
2452    if (dst->samples > 1) {
2453       /* I guess we need to use shader path in this case?
2454        * need a testcase which fails because of this
2455        */
2456       tu_finishme("unaligned store of msaa attachment\n");
2457       return;
2458    }
2459 
2460    r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2461 
2462    VkFormat format = src->format;
2463    if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2464       format = VK_FORMAT_D32_SFLOAT;
2465 
2466    if (dst->store) {
2467       store_cp_blit(cmd, cs, iview, src->samples, false, format,
2468                     src->gmem_offset, src->cpp);
2469    }
2470    if (dst->store_stencil) {
2471       store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2472                     src->gmem_offset_stencil, src->samples);
2473    }
2474 }
2475