1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  */
27 
28 #include "common/freedreno_guardband.h"
29 #include "tu_private.h"
30 
31 #include "ir3/ir3_nir.h"
32 #include "main/menums.h"
33 #include "nir/nir.h"
34 #include "nir/nir_builder.h"
35 #include "spirv/nir_spirv.h"
36 #include "util/debug.h"
37 #include "util/mesa-sha1.h"
38 #include "util/u_atomic.h"
39 #include "vk_format.h"
40 #include "vk_util.h"
41 
42 #include "tu_cs.h"
43 
44 /* Emit IB that preloads the descriptors that the shader uses */
45 
46 static void
emit_load_state(struct tu_cs * cs,unsigned opcode,enum a6xx_state_type st,enum a6xx_state_block sb,unsigned base,unsigned offset,unsigned count)47 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
48                 enum a6xx_state_block sb, unsigned base, unsigned offset,
49                 unsigned count)
50 {
51    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
52     * clear if emitting more packets will even help anything. Presumably the
53     * descriptor cache is relatively small, and these packets stop doing
54     * anything when there are too many descriptors.
55     */
56    tu_cs_emit_pkt7(cs, opcode, 3);
57    tu_cs_emit(cs,
58               CP_LOAD_STATE6_0_STATE_TYPE(st) |
59               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
60               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
61               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
62    tu_cs_emit_qw(cs, offset | (base << 28));
63 }
64 
65 static unsigned
tu6_load_state_size(struct tu_pipeline * pipeline,bool compute)66 tu6_load_state_size(struct tu_pipeline *pipeline, bool compute)
67 {
68    const unsigned load_state_size = 4;
69    unsigned size = 0;
70    for (unsigned i = 0; i < pipeline->layout->num_sets; i++) {
71       if (pipeline && !(pipeline->active_desc_sets & (1u << i)))
72          continue;
73 
74       struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout;
75       for (unsigned j = 0; j < set_layout->binding_count; j++) {
76          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
77          unsigned count = 0;
78          /* Note: some users, like amber for example, pass in
79           * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
80           * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
81           */
82          VkShaderStageFlags stages = compute ?
83             binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
84             binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
85          unsigned stage_count = util_bitcount(stages);
86 
87          if (!binding->array_size)
88             continue;
89 
90          switch (binding->type) {
91          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
92          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
93          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
94          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
95             /* IBO-backed resources only need one packet for all graphics stages */
96             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
97                count += 1;
98             if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
99                count += 1;
100             break;
101          case VK_DESCRIPTOR_TYPE_SAMPLER:
102          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
103          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
104          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
105          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
106             /* Textures and UBO's needs a packet for each stage */
107             count = stage_count;
108             break;
109          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
110             /* Because of how we pack combined images and samplers, we
111              * currently can't use one packet for the whole array.
112              */
113             count = stage_count * binding->array_size * 2;
114             break;
115          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
116             break;
117          default:
118             unreachable("bad descriptor type");
119          }
120          size += count * load_state_size;
121       }
122    }
123    return size;
124 }
125 
126 static void
tu6_emit_load_state(struct tu_pipeline * pipeline,bool compute)127 tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
128 {
129    unsigned size = tu6_load_state_size(pipeline, compute);
130    if (size == 0)
131       return;
132 
133    struct tu_cs cs;
134    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
135 
136    struct tu_pipeline_layout *layout = pipeline->layout;
137    for (unsigned i = 0; i < layout->num_sets; i++) {
138       /* From 13.2.7. Descriptor Set Binding:
139        *
140        *    A compatible descriptor set must be bound for all set numbers that
141        *    any shaders in a pipeline access, at the time that a draw or
142        *    dispatch command is recorded to execute using that pipeline.
143        *    However, if none of the shaders in a pipeline statically use any
144        *    bindings with a particular set number, then no descriptor set need
145        *    be bound for that set number, even if the pipeline layout includes
146        *    a non-trivial descriptor set layout for that set number.
147        *
148        * This means that descriptor sets unused by the pipeline may have a
149        * garbage or 0 BINDLESS_BASE register, which will cause context faults
150        * when prefetching descriptors from these sets. Skip prefetching for
151        * descriptors from them to avoid this. This is also an optimization,
152        * since these prefetches would be useless.
153        */
154       if (!(pipeline->active_desc_sets & (1u << i)))
155          continue;
156 
157       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
158       for (unsigned j = 0; j < set_layout->binding_count; j++) {
159          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
160          unsigned base = i;
161          unsigned offset = binding->offset / 4;
162          /* Note: some users, like amber for example, pass in
163           * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
164           * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
165           */
166          VkShaderStageFlags stages = compute ?
167             binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
168             binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
169          unsigned count = binding->array_size;
170          if (count == 0 || stages == 0)
171             continue;
172          switch (binding->type) {
173          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
174             base = MAX_SETS;
175             offset = (layout->set[i].dynamic_offset_start +
176                       binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
177             /* fallthrough */
178          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
179          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
180          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
181             /* IBO-backed resources only need one packet for all graphics stages */
182             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
183                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
184                                base, offset, count);
185             }
186             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
187                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
188                                base, offset, count);
189             }
190             break;
191          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
192             /* nothing - input attachment doesn't use bindless */
193             break;
194          case VK_DESCRIPTOR_TYPE_SAMPLER:
195          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
196          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
197             tu_foreach_stage(stage, stages) {
198                emit_load_state(&cs, tu6_stage2opcode(stage),
199                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
200                                ST6_SHADER : ST6_CONSTANTS,
201                                tu6_stage2texsb(stage), base, offset, count);
202             }
203             break;
204          }
205          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
206             base = MAX_SETS;
207             offset = (layout->set[i].dynamic_offset_start +
208                       binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
209             /* fallthrough */
210          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
211             tu_foreach_stage(stage, stages) {
212                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
213                                tu6_stage2shadersb(stage), base, offset, count);
214             }
215             break;
216          }
217          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
218             tu_foreach_stage(stage, stages) {
219                /* TODO: We could emit less CP_LOAD_STATE6 if we used
220                 * struct-of-arrays instead of array-of-structs.
221                 */
222                for (unsigned i = 0; i < count; i++) {
223                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
224                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
225                   emit_load_state(&cs, tu6_stage2opcode(stage),
226                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
227                                   base, tex_offset, 1);
228                   emit_load_state(&cs, tu6_stage2opcode(stage),
229                                   ST6_SHADER, tu6_stage2texsb(stage),
230                                   base, sam_offset, 1);
231                }
232             }
233             break;
234          }
235          default:
236             unreachable("bad descriptor type");
237          }
238       }
239    }
240 
241    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
242 }
243 
/* State collected while a pipeline is being built.
 *
 * NOTE(review): the code that fills in and consumes these fields is outside
 * this part of the file; confirm field semantics against the builder
 * functions before relying on them.
 */
struct tu_pipeline_builder
{
   struct tu_device *device;
   struct tu_pipeline_cache *cache;
   struct tu_pipeline_layout *layout;
   const VkAllocationCallbacks *alloc;
   const VkGraphicsPipelineCreateInfo *create_info;

   /* per-stage arrays, sized by MESA_SHADER_STAGES */
   struct tu_shader *shaders[MESA_SHADER_STAGES];
   struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
   struct ir3_shader_variant *binning_variant;
   uint64_t shader_iova[MESA_SHADER_STAGES];
   uint64_t binning_vs_iova;

   bool rasterizer_discard;
   /* these states are affected by rasterizer_discard */
   VkSampleCountFlagBits samples;
   bool use_color_attachments;
   bool use_dual_src_blend;
   uint32_t color_attachment_count;
   VkFormat color_attachment_formats[MAX_RTS];
   VkFormat depth_attachment_format;
   uint32_t render_components;
   uint32_t multiview_mask;
};
269 
270 static bool
tu_logic_op_reads_dst(VkLogicOp op)271 tu_logic_op_reads_dst(VkLogicOp op)
272 {
273    switch (op) {
274    case VK_LOGIC_OP_CLEAR:
275    case VK_LOGIC_OP_COPY:
276    case VK_LOGIC_OP_COPY_INVERTED:
277    case VK_LOGIC_OP_SET:
278       return false;
279    default:
280       return true;
281    }
282 }
283 
284 static VkBlendFactor
tu_blend_factor_no_dst_alpha(VkBlendFactor factor)285 tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
286 {
287    /* treat dst alpha as 1.0 and avoid reading it */
288    switch (factor) {
289    case VK_BLEND_FACTOR_DST_ALPHA:
290       return VK_BLEND_FACTOR_ONE;
291    case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
292       return VK_BLEND_FACTOR_ZERO;
293    default:
294       return factor;
295    }
296 }
297 
/* Return true when the blend factor is one of the four SRC1 factors
 * (SRC1_COLOR, ONE_MINUS_SRC1_COLOR, SRC1_ALPHA, ONE_MINUS_SRC1_ALPHA).
 */
static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}
310 
311 static bool
tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo * info)312 tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
313 {
314    if (!info)
315       return false;
316 
317    for (unsigned i = 0; i < info->attachmentCount; i++) {
318       const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
319       if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
320           tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
321           tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
322           tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
323          return true;
324    }
325 
326    return false;
327 }
328 
329 void
tu6_emit_xs_config(struct tu_cs * cs,gl_shader_stage stage,const struct ir3_shader_variant * xs,uint64_t binary_iova)330 tu6_emit_xs_config(struct tu_cs *cs,
331                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
332                    const struct ir3_shader_variant *xs,
333                    uint64_t binary_iova)
334 {
335    static const struct xs_config {
336       uint16_t reg_sp_xs_ctrl;
337       uint16_t reg_sp_xs_config;
338       uint16_t reg_hlsq_xs_ctrl;
339       uint16_t reg_sp_vs_obj_start;
340    } xs_config[] = {
341       [MESA_SHADER_VERTEX] = {
342          REG_A6XX_SP_VS_CTRL_REG0,
343          REG_A6XX_SP_VS_CONFIG,
344          REG_A6XX_HLSQ_VS_CNTL,
345          REG_A6XX_SP_VS_OBJ_START_LO,
346       },
347       [MESA_SHADER_TESS_CTRL] = {
348          REG_A6XX_SP_HS_CTRL_REG0,
349          REG_A6XX_SP_HS_CONFIG,
350          REG_A6XX_HLSQ_HS_CNTL,
351          REG_A6XX_SP_HS_OBJ_START_LO,
352       },
353       [MESA_SHADER_TESS_EVAL] = {
354          REG_A6XX_SP_DS_CTRL_REG0,
355          REG_A6XX_SP_DS_CONFIG,
356          REG_A6XX_HLSQ_DS_CNTL,
357          REG_A6XX_SP_DS_OBJ_START_LO,
358       },
359       [MESA_SHADER_GEOMETRY] = {
360          REG_A6XX_SP_GS_CTRL_REG0,
361          REG_A6XX_SP_GS_CONFIG,
362          REG_A6XX_HLSQ_GS_CNTL,
363          REG_A6XX_SP_GS_OBJ_START_LO,
364       },
365       [MESA_SHADER_FRAGMENT] = {
366          REG_A6XX_SP_FS_CTRL_REG0,
367          REG_A6XX_SP_FS_CONFIG,
368          REG_A6XX_HLSQ_FS_CNTL,
369          REG_A6XX_SP_FS_OBJ_START_LO,
370       },
371       [MESA_SHADER_COMPUTE] = {
372          REG_A6XX_SP_CS_CTRL_REG0,
373          REG_A6XX_SP_CS_CONFIG,
374          REG_A6XX_HLSQ_CS_CNTL,
375          REG_A6XX_SP_CS_OBJ_START_LO,
376       },
377    };
378    const struct xs_config *cfg = &xs_config[stage];
379 
380    if (!xs) {
381       /* shader stage disabled */
382       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
383       tu_cs_emit(cs, 0);
384 
385       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
386       tu_cs_emit(cs, 0);
387       return;
388    }
389 
390    bool is_fs = xs->type == MESA_SHADER_FRAGMENT;
391    enum a3xx_threadsize threadsize = FOUR_QUADS;
392 
393    /* TODO:
394     * the "threadsize" field may have nothing to do with threadsize,
395     * use a value that matches the blob until it is figured out
396     */
397    if (xs->type == MESA_SHADER_GEOMETRY)
398       threadsize = TWO_QUADS;
399 
400    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_ctrl, 1);
401    tu_cs_emit(cs,
402               A6XX_SP_VS_CTRL_REG0_THREADSIZE(threadsize) |
403               A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(xs->info.max_reg + 1) |
404               A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(xs->info.max_half_reg + 1) |
405               COND(xs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) |
406               A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(xs->branchstack) |
407               COND(xs->need_pixlod, A6XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
408               COND(xs->need_fine_derivatives, A6XX_SP_VS_CTRL_REG0_DIFF_FINE) |
409               /* only fragment shader sets VARYING bit */
410               COND(xs->total_in && is_fs, A6XX_SP_FS_CTRL_REG0_VARYING) |
411               /* unknown bit, seems unnecessary */
412               COND(is_fs, 0x1000000));
413 
414    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 2);
415    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
416                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
417                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
418                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
419                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
420                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
421                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
422    tu_cs_emit(cs, xs->instrlen);
423 
424    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
425    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
426                   A6XX_HLSQ_VS_CNTL_ENABLED);
427 
428    /* emit program binary
429     * binary_iova should be aligned to 1 instrlen unit (128 bytes)
430     */
431 
432    assert((binary_iova & 0x7f) == 0);
433 
434    tu_cs_emit_pkt4(cs, cfg->reg_sp_vs_obj_start, 2);
435    tu_cs_emit_qw(cs, binary_iova);
436 
437    tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
438    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
439                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
440                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
441                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
442                   CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen));
443    tu_cs_emit_qw(cs, binary_iova);
444 
445    /* emit immediates */
446 
447    const struct ir3_const_state *const_state = ir3_const_state(xs);
448    uint32_t base = const_state->offsets.immediate;
449    int size = DIV_ROUND_UP(const_state->immediates_count, 4);
450 
451    /* truncate size to avoid writing constants that shader
452     * does not use:
453     */
454    size = MIN2(size + base, xs->constlen) - base;
455 
456    if (size <= 0)
457       return;
458 
459    tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
460    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
461                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
462                   CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
463                   CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
464                   CP_LOAD_STATE6_0_NUM_UNIT(size));
465    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
466    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
467 
468    tu_cs_emit_array(cs, const_state->immediates, size * 4);
469 }
470 
471 static void
tu6_emit_cs_config(struct tu_cs * cs,const struct tu_shader * shader,const struct ir3_shader_variant * v,uint64_t binary_iova)472 tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
473                    const struct ir3_shader_variant *v,
474                    uint64_t binary_iova)
475 {
476    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
477          .cs_state = true,
478          .cs_ibo = true));
479 
480    tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v, binary_iova);
481 
482    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
483    tu_cs_emit(cs, 0x41);
484 
485    uint32_t local_invocation_id =
486       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
487    uint32_t work_group_id =
488       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
489 
490    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
491    tu_cs_emit(cs,
492               A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
493               A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
494               A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
495               A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
496    tu_cs_emit(cs, 0x2fc);             /* HLSQ_CS_UNKNOWN_B998 */
497 }
498 
499 static void
tu6_emit_vs_system_values(struct tu_cs * cs,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * hs,const struct ir3_shader_variant * ds,const struct ir3_shader_variant * gs,bool primid_passthru)500 tu6_emit_vs_system_values(struct tu_cs *cs,
501                           const struct ir3_shader_variant *vs,
502                           const struct ir3_shader_variant *hs,
503                           const struct ir3_shader_variant *ds,
504                           const struct ir3_shader_variant *gs,
505                           bool primid_passthru)
506 {
507    const uint32_t vertexid_regid =
508          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
509    const uint32_t instanceid_regid =
510          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
511    const uint32_t tess_coord_x_regid = hs ?
512          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
513          regid(63, 0);
514    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
515          tess_coord_x_regid + 1 :
516          regid(63, 0);
517    const uint32_t hs_patch_regid = hs ?
518          ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
519          regid(63, 0);
520    const uint32_t ds_patch_regid = hs ?
521          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
522          regid(63, 0);
523    const uint32_t hs_invocation_regid = hs ?
524          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
525          regid(63, 0);
526    const uint32_t primitiveid_regid = gs ?
527          ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
528          regid(63, 0);
529    const uint32_t gsheader_regid = gs ?
530          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
531          regid(63, 0);
532 
533    /* Note: we currently don't support multiview with tess or GS. If we did,
534     * and the HW actually works, then we'd have to somehow share this across
535     * stages. Note that the blob doesn't support this either.
536     */
537    const uint32_t viewid_regid =
538       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
539 
540    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
541    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
542                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
543                   A6XX_VFD_CONTROL_1_REGID4PRIMID(primitiveid_regid) |
544                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
545    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSPATCHID(hs_patch_regid) |
546                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
547    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSPATCHID(ds_patch_regid) |
548                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
549                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
550                   0xfc);
551    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
552    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
553                   0xfc00); /* VFD_CONTROL_5 */
554    tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
555 }
556 
557 /* Add any missing varyings needed for stream-out. Otherwise varyings not
558  * used by fragment shader will be stripped out.
559  */
560 static void
tu6_link_streamout(struct ir3_shader_linkage * l,const struct ir3_shader_variant * v)561 tu6_link_streamout(struct ir3_shader_linkage *l,
562                      const struct ir3_shader_variant *v)
563 {
564    const struct ir3_stream_output_info *info = &v->shader->stream_output;
565 
566    /*
567     * First, any stream-out varyings not already in linkage map (ie. also
568     * consumed by frag shader) need to be added:
569     */
570    for (unsigned i = 0; i < info->num_outputs; i++) {
571       const struct ir3_stream_output *out = &info->output[i];
572       unsigned compmask =
573                   (1 << (out->num_components + out->start_component)) - 1;
574       unsigned k = out->register_index;
575       unsigned idx, nextloc = 0;
576 
577       /* psize/pos need to be the last entries in linkage map, and will
578        * get added link_stream_out, so skip over them:
579        */
580       if (v->outputs[k].slot == VARYING_SLOT_PSIZ ||
581             v->outputs[k].slot == VARYING_SLOT_POS)
582          continue;
583 
584       for (idx = 0; idx < l->cnt; idx++) {
585          if (l->var[idx].regid == v->outputs[k].regid)
586             break;
587          nextloc = MAX2(nextloc, l->var[idx].loc + 4);
588       }
589 
590       /* add if not already in linkage map: */
591       if (idx == l->cnt)
592          ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);
593 
594       /* expand component-mask if needed, ie streaming out all components
595        * but frag shader doesn't consume all components:
596        */
597       if (compmask & ~l->var[idx].compmask) {
598          l->var[idx].compmask |= compmask;
599          l->max_loc = MAX2(l->max_loc, l->var[idx].loc +
600                            util_last_bit(l->var[idx].compmask));
601       }
602    }
603 }
604 
605 static void
tu6_setup_streamout(struct tu_cs * cs,const struct ir3_shader_variant * v,struct ir3_shader_linkage * l)606 tu6_setup_streamout(struct tu_cs *cs,
607                     const struct ir3_shader_variant *v,
608                     struct ir3_shader_linkage *l)
609 {
610    const struct ir3_stream_output_info *info = &v->shader->stream_output;
611    /* Note: 64 here comes from the HW layout of the program RAM. The program
612     * for stream N is at DWORD 64 * N.
613     */
614 #define A6XX_SO_PROG_DWORDS 64
615    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
616    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
617    uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};
618 
619    /* TODO: streamout state should be in a non-GMEM draw state */
620 
621    /* no streamout: */
622    if (info->num_outputs == 0) {
623       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
624       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
625       tu_cs_emit(cs, 0);
626       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
627       tu_cs_emit(cs, 0);
628       return;
629    }
630 
631    /* is there something to do with info->stride[i]? */
632 
633    for (unsigned i = 0; i < info->num_outputs; i++) {
634       const struct ir3_stream_output *out = &info->output[i];
635       unsigned k = out->register_index;
636       unsigned idx;
637 
638       /* Skip it, if there's an unused reg in the middle of outputs. */
639       if (v->outputs[k].regid == INVALID_REG)
640          continue;
641 
642       ncomp[out->output_buffer] += out->num_components;
643 
644       /* linkage map sorted by order frag shader wants things, so
645        * a bit less ideal here..
646        */
647       for (idx = 0; idx < l->cnt; idx++)
648          if (l->var[idx].regid == v->outputs[k].regid)
649             break;
650 
651       debug_assert(idx < l->cnt);
652 
653       for (unsigned j = 0; j < out->num_components; j++) {
654          unsigned c   = j + out->start_component;
655          unsigned loc = l->var[idx].loc + c;
656          unsigned off = j + out->dst_offset;  /* in dwords */
657 
658          assert(loc < A6XX_SO_PROG_DWORDS * 2);
659          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
660          if (loc & 1) {
661             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
662                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
663                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
664          } else {
665             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
666                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
667                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
668          }
669          BITSET_SET(valid_dwords, dword);
670       }
671    }
672 
673    unsigned prog_count = 0;
674    unsigned start, end;
675    BITSET_FOREACH_RANGE(start, end, valid_dwords,
676                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
677       prog_count += end - start + 1;
678    }
679 
680    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
681    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
682    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
683                   COND(ncomp[0] > 0,
684                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
685                   COND(ncomp[1] > 0,
686                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
687                   COND(ncomp[2] > 0,
688                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
689                   COND(ncomp[3] > 0,
690                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
691    for (uint32_t i = 0; i < 4; i++) {
692       tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
693       tu_cs_emit(cs, ncomp[i]);
694    }
695    bool first = true;
696    BITSET_FOREACH_RANGE(start, end, valid_dwords,
697                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
698       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
699       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
700                      A6XX_VPC_SO_CNTL_ADDR(start));
701       for (unsigned i = start; i < end; i++) {
702          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
703          tu_cs_emit(cs, prog[i]);
704       }
705       first = false;
706    }
707 }
708 
709 static void
tu6_emit_const(struct tu_cs * cs,uint32_t opcode,uint32_t base,enum a6xx_state_block block,uint32_t offset,uint32_t size,const uint32_t * dwords)710 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
711                enum a6xx_state_block block, uint32_t offset,
712                uint32_t size, const uint32_t *dwords) {
713    assert(size % 4 == 0);
714 
715    tu_cs_emit_pkt7(cs, opcode, 3 + size);
716    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
717          CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
718          CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
719          CP_LOAD_STATE6_0_STATE_BLOCK(block) |
720          CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
721 
722    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
723    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
724    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
725 
726    tu_cs_emit_array(cs, dwords, size);
727 }
728 
729 static void
tu6_emit_link_map(struct tu_cs * cs,const struct ir3_shader_variant * producer,const struct ir3_shader_variant * consumer,enum a6xx_state_block sb)730 tu6_emit_link_map(struct tu_cs *cs,
731                   const struct ir3_shader_variant *producer,
732                   const struct ir3_shader_variant *consumer,
733                   enum a6xx_state_block sb)
734 {
735    const struct ir3_const_state *const_state = ir3_const_state(consumer);
736    uint32_t base = const_state->offsets.primitive_map;
737    int size = DIV_ROUND_UP(consumer->input_size, 4);
738 
739    size = (MIN2(size + base, consumer->constlen) - base) * 4;
740    if (size <= 0)
741       return;
742 
743    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
744                          producer->output_loc);
745 }
746 
747 static uint16_t
gl_primitive_to_tess(uint16_t primitive)748 gl_primitive_to_tess(uint16_t primitive) {
749    switch (primitive) {
750    case GL_POINTS:
751       return TESS_POINTS;
752    case GL_LINE_STRIP:
753       return TESS_LINES;
754    case GL_TRIANGLE_STRIP:
755       return TESS_CW_TRIS;
756    default:
757       unreachable("");
758    }
759 }
760 
761 void
tu6_emit_vpc(struct tu_cs * cs,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * hs,const struct ir3_shader_variant * ds,const struct ir3_shader_variant * gs,const struct ir3_shader_variant * fs,uint32_t patch_control_points,bool vshs_workgroup)762 tu6_emit_vpc(struct tu_cs *cs,
763              const struct ir3_shader_variant *vs,
764              const struct ir3_shader_variant *hs,
765              const struct ir3_shader_variant *ds,
766              const struct ir3_shader_variant *gs,
767              const struct ir3_shader_variant *fs,
768              uint32_t patch_control_points,
769              bool vshs_workgroup)
770 {
771    /* note: doesn't compile as static because of the array regs.. */
772    const struct reg_config {
773       uint16_t reg_sp_xs_out_reg;
774       uint16_t reg_sp_xs_vpc_dst_reg;
775       uint16_t reg_vpc_xs_pack;
776       uint16_t reg_vpc_xs_clip_cntl;
777       uint16_t reg_gras_xs_cl_cntl;
778       uint16_t reg_pc_xs_out_cntl;
779       uint16_t reg_sp_xs_primitive_cntl;
780       uint16_t reg_vpc_xs_layer_cntl;
781       uint16_t reg_gras_xs_layer_cntl;
782    } reg_config[] = {
783       [MESA_SHADER_VERTEX] = {
784          REG_A6XX_SP_VS_OUT_REG(0),
785          REG_A6XX_SP_VS_VPC_DST_REG(0),
786          REG_A6XX_VPC_VS_PACK,
787          REG_A6XX_VPC_VS_CLIP_CNTL,
788          REG_A6XX_GRAS_VS_CL_CNTL,
789          REG_A6XX_PC_VS_OUT_CNTL,
790          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
791          REG_A6XX_VPC_VS_LAYER_CNTL,
792          REG_A6XX_GRAS_VS_LAYER_CNTL
793       },
794       [MESA_SHADER_TESS_EVAL] = {
795          REG_A6XX_SP_DS_OUT_REG(0),
796          REG_A6XX_SP_DS_VPC_DST_REG(0),
797          REG_A6XX_VPC_DS_PACK,
798          REG_A6XX_VPC_DS_CLIP_CNTL,
799          REG_A6XX_GRAS_DS_CL_CNTL,
800          REG_A6XX_PC_DS_OUT_CNTL,
801          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
802          REG_A6XX_VPC_DS_LAYER_CNTL,
803          REG_A6XX_GRAS_DS_LAYER_CNTL
804       },
805       [MESA_SHADER_GEOMETRY] = {
806          REG_A6XX_SP_GS_OUT_REG(0),
807          REG_A6XX_SP_GS_VPC_DST_REG(0),
808          REG_A6XX_VPC_GS_PACK,
809          REG_A6XX_VPC_GS_CLIP_CNTL,
810          REG_A6XX_GRAS_GS_CL_CNTL,
811          REG_A6XX_PC_GS_OUT_CNTL,
812          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
813          REG_A6XX_VPC_GS_LAYER_CNTL,
814          REG_A6XX_GRAS_GS_LAYER_CNTL
815       },
816    };
817 
818    const struct ir3_shader_variant *last_shader;
819    if (gs) {
820       last_shader = gs;
821    } else if (hs) {
822       last_shader = ds;
823    } else {
824       last_shader = vs;
825    }
826 
827    const struct reg_config *cfg = &reg_config[last_shader->type];
828 
829    struct ir3_shader_linkage linkage = {
830       .primid_loc = 0xff,
831       .clip0_loc = 0xff,
832       .clip1_loc = 0xff,
833    };
834    if (fs)
835       ir3_link_shaders(&linkage, last_shader, fs, true);
836 
837    if (last_shader->shader->stream_output.num_outputs)
838       tu6_link_streamout(&linkage, last_shader);
839 
840    /* We do this after linking shaders in order to know whether PrimID
841     * passthrough needs to be enabled.
842     */
843    bool primid_passthru = linkage.primid_loc != 0xff;
844    tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);
845 
846    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
847    tu_cs_emit(cs, ~linkage.varmask[0]);
848    tu_cs_emit(cs, ~linkage.varmask[1]);
849    tu_cs_emit(cs, ~linkage.varmask[2]);
850    tu_cs_emit(cs, ~linkage.varmask[3]);
851 
852    /* a6xx finds position/pointsize at the end */
853    const uint32_t pointsize_regid =
854       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
855    const uint32_t layer_regid =
856       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
857    const uint32_t view_regid =
858       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
859    const uint32_t clip0_regid =
860       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
861    const uint32_t clip1_regid =
862       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
863    uint32_t primitive_regid = gs ?
864       ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : regid(63, 0);
865    uint32_t flags_regid = gs ?
866       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
867 
868    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
869 
870    if (layer_regid != regid(63, 0)) {
871       layer_loc = linkage.max_loc;
872       ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
873    }
874 
875    if (view_regid != regid(63, 0)) {
876       view_loc = linkage.max_loc;
877       ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
878    }
879 
880    unsigned extra_pos = 0;
881 
882    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
883       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
884          continue;
885 
886       if (position_loc == 0xff)
887          position_loc = linkage.max_loc;
888 
889       ir3_link_add(&linkage, last_shader->outputs[i].regid,
890                    0xf, position_loc + 4 * last_shader->outputs[i].view);
891       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
892    }
893 
894    if (pointsize_regid != regid(63, 0)) {
895       pointsize_loc = linkage.max_loc;
896       ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
897    }
898 
899    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
900 
901    /* Handle the case where clip/cull distances aren't read by the FS */
902    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
903    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
904       clip0_loc = linkage.max_loc;
905       ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
906    }
907    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
908       clip1_loc = linkage.max_loc;
909       ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
910    }
911 
912    tu6_setup_streamout(cs, last_shader, &linkage);
913 
914    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
915     * at least when a DS is the last stage, so add a dummy output to keep it
916     * happy if there aren't any. We do this late in order to avoid emitting
917     * any unused code and make sure that optimizations don't remove it.
918     */
919    if (linkage.cnt == 0)
920       ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);
921 
922    /* map outputs of the last shader to VPC */
923    assert(linkage.cnt <= 32);
924    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
925    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
926    uint32_t sp_out[16];
927    uint32_t sp_vpc_dst[8];
928    for (uint32_t i = 0; i < linkage.cnt; i++) {
929       ((uint16_t *) sp_out)[i] =
930          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
931          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
932       ((uint8_t *) sp_vpc_dst)[i] =
933          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
934    }
935 
936    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
937    tu_cs_emit_array(cs, sp_out, sp_out_count);
938 
939    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
940    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
941 
942    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
943    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
944                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
945                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
946                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
947 
948    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
949    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
950                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
951                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
952 
953    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
954    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
955                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
956 
957    tu_cs_emit_pkt4(cs, cfg->reg_pc_xs_out_cntl, 1);
958    tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
959                   CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
960                   CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
961                   CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
962                   CONDREG(primitive_regid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
963                   A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
964 
965    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
966    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
967                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
968 
969    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
970    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
971                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
972 
973    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
974    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
975                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
976 
977    tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));
978 
979    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
980    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
981                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
982                   A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
983                   A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
984 
985    if (hs) {
986       shader_info *hs_info = &hs->shader->nir->info;
987       uint32_t unknown_a831 = vs->output_size;
988 
989       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
990       tu_cs_emit(cs, hs_info->tess.tcs_vertices_out);
991 
992       /* Total attribute slots in HS incoming patch. */
993       tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
994       tu_cs_emit(cs, patch_control_points * vs->output_size / 4);
995 
996       /* for A650 this value seems to be local memory size per wave */
997       if (vshs_workgroup) {
998          const uint32_t wavesize = 64;
999          /* note: if HS is really just the VS extended, then this
1000           * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
1001           * however that doesn't match the blob, and fails some dEQP tests.
1002           */
1003          uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
1004          uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
1005          unknown_a831 = DIV_ROUND_UP(total_size, wavesize);
1006       }
1007 
1008       tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
1009       tu_cs_emit(cs, unknown_a831);
1010 
1011       /* In SPIR-V generated from GLSL, the tessellation primitive params are
1012        * are specified in the tess eval shader, but in SPIR-V generated from
1013        * HLSL, they are specified in the tess control shader. */
1014       shader_info *tess_info =
1015             ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ?
1016             &hs->shader->nir->info : &ds->shader->nir->info;
1017       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
1018       uint32_t output;
1019       if (tess_info->tess.point_mode)
1020          output = TESS_POINTS;
1021       else if (tess_info->tess.primitive_mode == GL_ISOLINES)
1022          output = TESS_LINES;
1023       else if (tess_info->tess.ccw)
1024          output = TESS_CCW_TRIS;
1025       else
1026          output = TESS_CW_TRIS;
1027 
1028       enum a6xx_tess_spacing spacing;
1029       switch (tess_info->tess.spacing) {
1030       case TESS_SPACING_EQUAL:
1031          spacing = TESS_EQUAL;
1032          break;
1033       case TESS_SPACING_FRACTIONAL_ODD:
1034          spacing = TESS_FRACTIONAL_ODD;
1035          break;
1036       case TESS_SPACING_FRACTIONAL_EVEN:
1037          spacing = TESS_FRACTIONAL_EVEN;
1038          break;
1039       case TESS_SPACING_UNSPECIFIED:
1040       default:
1041          unreachable("invalid tess spacing");
1042       }
1043       tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
1044             A6XX_PC_TESS_CNTL_OUTPUT(output));
1045 
1046       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1047       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1048    }
1049 
1050 
1051    if (gs) {
1052       uint32_t vertices_out, invocations, output, vec4_size;
1053       /* this detects the tu_clear_blit path, which doesn't set ->nir */
1054       if (gs->shader->nir) {
1055          if (hs) {
1056             tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1057          } else {
1058             tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1059          }
1060          vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
1061          output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
1062          invocations = gs->shader->nir->info.gs.invocations - 1;
1063          /* Size of per-primitive alloction in ldlw memory in vec4s. */
1064          vec4_size = gs->shader->nir->info.gs.vertices_in *
1065                      DIV_ROUND_UP(vs->output_size, 4);
1066       } else {
1067          vertices_out = 3;
1068          output = TESS_CW_TRIS;
1069          invocations = 0;
1070          vec4_size = 0;
1071       }
1072 
1073       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1074       tu_cs_emit(cs,
1075             A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
1076             A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
1077             A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
1078 
1079       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1);
1080       tu_cs_emit(cs, 0);
1081 
1082       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_UNKNOWN_9100, 1);
1083       tu_cs_emit(cs, 0xff);
1084 
1085       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1086       tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1087 
1088       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1089       tu_cs_emit(cs, vs->output_size);
1090    }
1091 }
1092 
1093 static int
tu6_vpc_varying_mode(const struct ir3_shader_variant * fs,uint32_t index,uint8_t * interp_mode,uint8_t * ps_repl_mode)1094 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1095                      uint32_t index,
1096                      uint8_t *interp_mode,
1097                      uint8_t *ps_repl_mode)
1098 {
1099    enum
1100    {
1101       INTERP_SMOOTH = 0,
1102       INTERP_FLAT = 1,
1103       INTERP_ZERO = 2,
1104       INTERP_ONE = 3,
1105    };
1106    enum
1107    {
1108       PS_REPL_NONE = 0,
1109       PS_REPL_S = 1,
1110       PS_REPL_T = 2,
1111       PS_REPL_ONE_MINUS_T = 3,
1112    };
1113 
1114    const uint32_t compmask = fs->inputs[index].compmask;
1115 
1116    /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
1117     * fourth component occupy three consecutive varying slots
1118     */
1119    int shift = 0;
1120    *interp_mode = 0;
1121    *ps_repl_mode = 0;
1122    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1123       if (compmask & 0x1) {
1124          *ps_repl_mode |= PS_REPL_S << shift;
1125          shift += 2;
1126       }
1127       if (compmask & 0x2) {
1128          *ps_repl_mode |= PS_REPL_T << shift;
1129          shift += 2;
1130       }
1131       if (compmask & 0x4) {
1132          *interp_mode |= INTERP_ZERO << shift;
1133          shift += 2;
1134       }
1135       if (compmask & 0x8) {
1136          *interp_mode |= INTERP_ONE << 6;
1137          shift += 2;
1138       }
1139    } else if (fs->inputs[index].flat) {
1140       for (int i = 0; i < 4; i++) {
1141          if (compmask & (1 << i)) {
1142             *interp_mode |= INTERP_FLAT << shift;
1143             shift += 2;
1144          }
1145       }
1146    }
1147 
1148    return shift;
1149 }
1150 
1151 static void
tu6_emit_vpc_varying_modes(struct tu_cs * cs,const struct ir3_shader_variant * fs)1152 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1153                            const struct ir3_shader_variant *fs)
1154 {
1155    uint32_t interp_modes[8] = { 0 };
1156    uint32_t ps_repl_modes[8] = { 0 };
1157 
1158    if (fs) {
1159       for (int i = -1;
1160            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1161 
1162          /* get the mode for input i */
1163          uint8_t interp_mode;
1164          uint8_t ps_repl_mode;
1165          const int bits =
1166             tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1167 
1168          /* OR the mode into the array */
1169          const uint32_t inloc = fs->inputs[i].inloc * 2;
1170          uint32_t n = inloc / 32;
1171          uint32_t shift = inloc % 32;
1172          interp_modes[n] |= interp_mode << shift;
1173          ps_repl_modes[n] |= ps_repl_mode << shift;
1174          if (shift + bits > 32) {
1175             n++;
1176             shift = 32 - shift;
1177 
1178             interp_modes[n] |= interp_mode >> shift;
1179             ps_repl_modes[n] |= ps_repl_mode >> shift;
1180          }
1181       }
1182    }
1183 
1184    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1185    tu_cs_emit_array(cs, interp_modes, 8);
1186 
1187    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1188    tu_cs_emit_array(cs, ps_repl_modes, 8);
1189 }
1190 
1191 void
tu6_emit_fs_inputs(struct tu_cs * cs,const struct ir3_shader_variant * fs)1192 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1193 {
1194    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1195    uint32_t ij_regid[IJ_COUNT];
1196    uint32_t smask_in_regid;
1197 
1198    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1199    bool enable_varyings = fs->total_in > 0;
1200 
1201    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1202    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1203    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1204    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1205    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1206    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1207       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1208 
1209    if (VALIDREG(ij_regid[IJ_LINEAR_SAMPLE]))
1210       tu_finishme("linear sample varying");
1211 
1212    if (VALIDREG(ij_regid[IJ_LINEAR_CENTROID]))
1213       tu_finishme("linear centroid varying");
1214 
1215    if (fs->num_sampler_prefetch > 0) {
1216       assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
1217       /* also, it seems like ij_pix is *required* to be r0.x */
1218       assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1219    }
1220 
1221    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1222    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1223          A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
1224          0x7000);    // XXX);
1225    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1226       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1227       tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
1228                      A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
1229                      A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
1230                      A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
1231                      A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
1232                      COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
1233                      A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
1234    }
1235 
1236    if (fs->num_sampler_prefetch > 0) {
1237       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1238       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1239          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1240          tu_cs_emit(cs,
1241                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1242                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1243       }
1244    }
1245 
1246    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
1247    tu_cs_emit(cs, 0x7);
1248    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
1249                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
1250                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
1251                   A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE]));
1252    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
1253                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
1254                   A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
1255                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
1256    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
1257                   A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
1258                   A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
1259                   A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
1260    tu_cs_emit(cs, 0xfc);
1261 
1262    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UNKNOWN_B980, 1);
1263    tu_cs_emit(cs, enable_varyings ? 3 : 1);
1264 
1265    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1266    bool need_size_persamp = false;
1267    if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) {
1268       if (sample_shading)
1269          need_size_persamp = true;
1270       else
1271          need_size = true;
1272    }
1273    if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL]))
1274       need_size = true;
1275 
1276    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1277    tu_cs_emit(cs,
1278          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1279          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1280          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1281          COND(need_size, A6XX_GRAS_CNTL_SIZE) |
1282          COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) |
1283          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1284 
1285    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1286    tu_cs_emit(cs,
1287          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1288          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1289          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1290          COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) |
1291          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1292          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) |
1293          COND(fs->fragcoord_compmask != 0,
1294                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1295    tu_cs_emit(cs,
1296          /* these two bits (UNK4/UNK5) relate to fragcoord
1297           * without them, fragcoord is the same for all samples
1298           */
1299          COND(sample_shading, A6XX_RB_RENDER_CONTROL1_UNK4) |
1300          COND(sample_shading, A6XX_RB_RENDER_CONTROL1_UNK5) |
1301          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1302          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1303          CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) |
1304          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1305 
1306    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1307    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1308 
1309    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8101, 1);
1310    tu_cs_emit(cs, COND(sample_shading, 0x6));  // XXX
1311 
1312    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1313    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1314 }
1315 
1316 static void
tu6_emit_fs_outputs(struct tu_cs * cs,const struct ir3_shader_variant * fs,uint32_t mrt_count,bool dual_src_blend,uint32_t render_components,bool is_s8_uint)1317 tu6_emit_fs_outputs(struct tu_cs *cs,
1318                     const struct ir3_shader_variant *fs,
1319                     uint32_t mrt_count, bool dual_src_blend,
1320                     uint32_t render_components,
1321                     bool is_s8_uint)
1322 {
1323    uint32_t smask_regid, posz_regid, stencilref_regid;
1324 
1325    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1326    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1327    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1328 
1329    uint32_t fragdata_regid[8];
1330    if (fs->color0_mrt) {
1331       fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
1332       for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
1333          fragdata_regid[i] = fragdata_regid[0];
1334    } else {
1335       for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
1336          fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1337    }
1338 
1339    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1340    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1341                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1342                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1343                   COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1344    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1345 
1346    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
1347    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1348       // TODO we could have a mix of half and full precision outputs,
1349       // we really need to figure out half-precision from IR3_REG_HALF
1350       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1351                         (false ? A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION : 0));
1352    }
1353 
1354    tu_cs_emit_regs(cs,
1355                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = render_components));
1356 
1357    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
1358    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1359                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1360                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1361                   COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1362    tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
1363 
1364    tu_cs_emit_regs(cs,
1365                    A6XX_RB_RENDER_COMPONENTS(.dword = render_components));
1366 
1367    enum a6xx_ztest_mode zmode;
1368 
1369    if (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || is_s8_uint) {
1370       zmode = A6XX_LATE_Z;
1371    } else {
1372       zmode = A6XX_EARLY_Z;
1373    }
1374 
1375    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
1376    tu_cs_emit(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
1377 
1378    tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
1379    tu_cs_emit(cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
1380 }
1381 
1382 static void
tu6_emit_geom_tess_consts(struct tu_cs * cs,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * hs,const struct ir3_shader_variant * ds,const struct ir3_shader_variant * gs,uint32_t cps_per_patch)1383 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1384                           const struct ir3_shader_variant *vs,
1385                           const struct ir3_shader_variant *hs,
1386                           const struct ir3_shader_variant *ds,
1387                           const struct ir3_shader_variant *gs,
1388                           uint32_t cps_per_patch)
1389 {
1390    uint32_t num_vertices =
1391          hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
1392 
1393    uint32_t vs_params[4] = {
1394       vs->output_size * num_vertices * 4,  /* vs primitive stride */
1395       vs->output_size * 4,                 /* vs vertex stride */
1396       0,
1397       0,
1398    };
1399    uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
1400    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
1401                   ARRAY_SIZE(vs_params), vs_params);
1402 
1403    if (hs) {
1404       assert(ds->type != MESA_SHADER_NONE);
1405       uint32_t hs_params[4] = {
1406          vs->output_size * num_vertices * 4,  /* hs primitive stride */
1407          vs->output_size * 4,                 /* hs vertex stride */
1408          hs->output_size,
1409          cps_per_patch,
1410       };
1411 
1412       uint32_t hs_base = hs->const_state->offsets.primitive_param;
1413       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
1414                      ARRAY_SIZE(hs_params), hs_params);
1415       if (gs)
1416          num_vertices = gs->shader->nir->info.gs.vertices_in;
1417 
1418       uint32_t ds_params[4] = {
1419          ds->output_size * num_vertices * 4,  /* ds primitive stride */
1420          ds->output_size * 4,                 /* ds vertex stride */
1421          hs->output_size,                     /* hs vertex stride (dwords) */
1422          hs->shader->nir->info.tess.tcs_vertices_out
1423       };
1424 
1425       uint32_t ds_base = ds->const_state->offsets.primitive_param;
1426       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
1427                      ARRAY_SIZE(ds_params), ds_params);
1428    }
1429 
1430    if (gs) {
1431       const struct ir3_shader_variant *prev = ds ? ds : vs;
1432       uint32_t gs_params[4] = {
1433          prev->output_size * num_vertices * 4,  /* gs primitive stride */
1434          prev->output_size * 4,                 /* gs vertex stride */
1435          0,
1436          0,
1437       };
1438       uint32_t gs_base = gs->const_state->offsets.primitive_param;
1439       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
1440                      ARRAY_SIZE(gs_params), gs_params);
1441    }
1442 }
1443 
1444 static void
tu6_emit_program(struct tu_cs * cs,struct tu_pipeline_builder * builder,bool binning_pass)1445 tu6_emit_program(struct tu_cs *cs,
1446                  struct tu_pipeline_builder *builder,
1447                  bool binning_pass)
1448 {
1449    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
1450    const struct ir3_shader_variant *bs = builder->binning_variant;
1451    const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
1452    const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
1453    const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
1454    const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
1455    gl_shader_stage stage = MESA_SHADER_VERTEX;
1456    uint32_t cps_per_patch = builder->create_info->pTessellationState ?
1457       builder->create_info->pTessellationState->patchControlPoints : 0;
1458    bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;
1459 
1460    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1461 
1462    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1463          .vs_state = true,
1464          .hs_state = true,
1465          .ds_state = true,
1466          .gs_state = true,
1467          .fs_state = true,
1468          .gfx_ibo = true));
1469 
1470   /* Don't use the binning pass variant when GS is present because we don't
1471    * support compiling correct binning pass variants with GS.
1472    */
1473    if (binning_pass && !gs) {
1474       vs = bs;
1475       tu6_emit_xs_config(cs, stage, bs, builder->binning_vs_iova);
1476       stage++;
1477    }
1478 
1479    for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1480       const struct ir3_shader_variant *xs = builder->variants[stage];
1481 
1482       if (stage == MESA_SHADER_FRAGMENT && binning_pass)
1483          fs = xs = NULL;
1484 
1485       tu6_emit_xs_config(cs, stage, xs, builder->shader_iova[stage]);
1486    }
1487 
1488    uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
1489    uint32_t multiview_cntl = builder->multiview_mask ?
1490       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1491       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1492       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1493       : 0;
1494 
1495    /* Copy what the blob does here. This will emit an extra 0x3f
1496     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1497     * this is working around yet.
1498     */
1499    tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1500    tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1501    tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1502    tu_cs_emit(cs, multiview_cntl);
1503 
1504    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1505    tu_cs_emit(cs, multiview_cntl);
1506 
1507    if (multiview_cntl &&
1508        builder->device->physical_device->info.a6xx.supports_multiview_mask) {
1509       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1510       tu_cs_emit(cs, builder->multiview_mask);
1511    }
1512 
1513    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
1514    tu_cs_emit(cs, 0);
1515 
1516    tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch,
1517                 builder->device->physical_device->gpu_id == 650);
1518    tu6_emit_vpc_varying_modes(cs, fs);
1519 
1520    if (fs) {
1521       tu6_emit_fs_inputs(cs, fs);
1522       tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count,
1523                           builder->use_dual_src_blend,
1524                           builder->render_components,
1525                           builder->depth_attachment_format == VK_FORMAT_S8_UINT);
1526    } else {
1527       /* TODO: check if these can be skipped if fs is disabled */
1528       struct ir3_shader_variant dummy_variant = {};
1529       tu6_emit_fs_inputs(cs, &dummy_variant);
1530       tu6_emit_fs_outputs(cs, &dummy_variant, builder->color_attachment_count,
1531                           builder->use_dual_src_blend,
1532                           builder->render_components,
1533                           builder->depth_attachment_format == VK_FORMAT_S8_UINT);
1534    }
1535 
1536    if (gs || hs) {
1537       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
1538    }
1539 }
1540 
1541 static void
tu6_emit_vertex_input(struct tu_pipeline * pipeline,struct tu_cs * cs,const struct ir3_shader_variant * vs,const VkPipelineVertexInputStateCreateInfo * info)1542 tu6_emit_vertex_input(struct tu_pipeline *pipeline,
1543                       struct tu_cs *cs,
1544                       const struct ir3_shader_variant *vs,
1545                       const VkPipelineVertexInputStateCreateInfo *info)
1546 {
1547    uint32_t vfd_decode_idx = 0;
1548    uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
1549    uint32_t step_rate[MAX_VBS];
1550 
1551    for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
1552       const VkVertexInputBindingDescription *binding =
1553          &info->pVertexBindingDescriptions[i];
1554 
1555       if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
1556          tu_cs_emit_regs(cs,
1557                         A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
1558       }
1559 
1560       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1561          binding_instanced |= 1 << binding->binding;
1562 
1563       step_rate[binding->binding] = 1;
1564    }
1565 
1566    const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
1567       vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1568    if (div_state) {
1569       for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
1570          const VkVertexInputBindingDivisorDescriptionEXT *desc =
1571             &div_state->pVertexBindingDivisors[i];
1572          step_rate[desc->binding] = desc->divisor;
1573       }
1574    }
1575 
1576    /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */
1577 
1578    for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
1579       const VkVertexInputAttributeDescription *attr =
1580          &info->pVertexAttributeDescriptions[i];
1581       uint32_t input_idx;
1582 
1583       for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
1584          if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location)
1585             break;
1586       }
1587 
1588       /* attribute not used, skip it */
1589       if (input_idx == vs->inputs_count)
1590          continue;
1591 
1592       const struct tu_native_format format = tu6_format_vtx(attr->format);
1593       tu_cs_emit_regs(cs,
1594                       A6XX_VFD_DECODE_INSTR(vfd_decode_idx,
1595                         .idx = attr->binding,
1596                         .offset = attr->offset,
1597                         .instanced = binding_instanced & (1 << attr->binding),
1598                         .format = format.fmt,
1599                         .swap = format.swap,
1600                         .unk30 = 1,
1601                         ._float = !vk_format_is_int(attr->format)),
1602                       A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding]));
1603 
1604       tu_cs_emit_regs(cs,
1605                       A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx,
1606                         .writemask = vs->inputs[input_idx].compmask,
1607                         .regid = vs->inputs[input_idx].regid));
1608 
1609       vfd_decode_idx++;
1610    }
1611 
1612    tu_cs_emit_regs(cs,
1613                    A6XX_VFD_CONTROL_0(
1614                      .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */
1615                      .decode_cnt = vfd_decode_idx));
1616 }
1617 
1618 void
tu6_emit_viewport(struct tu_cs * cs,const VkViewport * viewports,uint32_t num_viewport)1619 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport)
1620 {
1621    VkExtent2D guardband = {511, 511};
1622 
1623    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
1624    for (uint32_t i = 0; i < num_viewport; i++) {
1625       const VkViewport *viewport = &viewports[i];
1626       float offsets[3];
1627       float scales[3];
1628       scales[0] = viewport->width / 2.0f;
1629       scales[1] = viewport->height / 2.0f;
1630       scales[2] = viewport->maxDepth - viewport->minDepth;
1631       offsets[0] = viewport->x + scales[0];
1632       offsets[1] = viewport->y + scales[1];
1633       offsets[2] = viewport->minDepth;
1634       for (uint32_t j = 0; j < 3; j++) {
1635          tu_cs_emit(cs, fui(offsets[j]));
1636          tu_cs_emit(cs, fui(scales[j]));
1637       }
1638 
1639       guardband.width =
1640          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
1641       guardband.height =
1642          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
1643    }
1644 
1645    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
1646    for (uint32_t i = 0; i < num_viewport; i++) {
1647       const VkViewport *viewport = &viewports[i];
1648       VkOffset2D min;
1649       VkOffset2D max;
1650       min.x = (int32_t) viewport->x;
1651       max.x = (int32_t) ceilf(viewport->x + viewport->width);
1652       if (viewport->height >= 0.0f) {
1653          min.y = (int32_t) viewport->y;
1654          max.y = (int32_t) ceilf(viewport->y + viewport->height);
1655       } else {
1656          min.y = (int32_t)(viewport->y + viewport->height);
1657          max.y = (int32_t) ceilf(viewport->y);
1658       }
1659       /* the spec allows viewport->height to be 0.0f */
1660       if (min.y == max.y)
1661          max.y++;
1662       /* allow viewport->width = 0.0f for un-initialized viewports: */
1663       if (min.x == max.x)
1664          max.x++;
1665       assert(min.x >= 0 && min.x < max.x);
1666       assert(min.y >= 0 && min.y < max.y);
1667       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
1668                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
1669       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) |
1670                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1));
1671    }
1672 
1673    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
1674    for (uint32_t i = 0; i < num_viewport; i++) {
1675       const VkViewport *viewport = &viewports[i];
1676       tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
1677       tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
1678    }
1679    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
1680    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
1681                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
1682 
1683    /* TODO: what to do about this and multi viewport ? */
1684    float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1685    float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
1686 
1687    tu_cs_emit_regs(cs,
1688                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
1689                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
1690 }
1691 
1692 void
tu6_emit_scissor(struct tu_cs * cs,const VkRect2D * scissors,uint32_t scissor_count)1693 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
1694 {
1695    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
1696 
1697    for (uint32_t i = 0; i < scissor_count; i++) {
1698       const VkRect2D *scissor = &scissors[i];
1699 
1700       uint32_t min_x = scissor->offset.x;
1701       uint32_t min_y = scissor->offset.y;
1702       uint32_t max_x = min_x + scissor->extent.width - 1;
1703       uint32_t max_y = min_y + scissor->extent.height - 1;
1704 
1705       if (!scissor->extent.width || !scissor->extent.height) {
1706          min_x = min_y = 1;
1707          max_x = max_y = 0;
1708       } else {
1709          /* avoid overflow */
1710          uint32_t scissor_max = BITFIELD_MASK(15);
1711          min_x = MIN2(scissor_max, min_x);
1712          min_y = MIN2(scissor_max, min_y);
1713          max_x = MIN2(scissor_max, max_x);
1714          max_y = MIN2(scissor_max, max_y);
1715       }
1716 
1717       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
1718                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
1719       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
1720                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
1721    }
1722 }
1723 
1724 void
tu6_emit_sample_locations(struct tu_cs * cs,const VkSampleLocationsInfoEXT * samp_loc)1725 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
1726 {
1727    if (!samp_loc) {
1728       tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
1729       tu_cs_emit(cs, 0);
1730 
1731       tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
1732       tu_cs_emit(cs, 0);
1733 
1734       tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
1735       tu_cs_emit(cs, 0);
1736       return;
1737    }
1738 
1739    assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
1740    assert(samp_loc->sampleLocationGridSize.width == 1);
1741    assert(samp_loc->sampleLocationGridSize.height == 1);
1742 
1743    uint32_t sample_config =
1744       A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
1745    uint32_t sample_locations = 0;
1746    for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
1747       sample_locations |=
1748          (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
1749           A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
1750    }
1751 
1752    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
1753    tu_cs_emit(cs, sample_config);
1754    tu_cs_emit(cs, sample_locations);
1755 
1756    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
1757    tu_cs_emit(cs, sample_config);
1758    tu_cs_emit(cs, sample_locations);
1759 
1760    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
1761    tu_cs_emit(cs, sample_config);
1762    tu_cs_emit(cs, sample_locations);
1763 }
1764 
1765 static uint32_t
tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo * rast_info,VkSampleCountFlagBits samples,bool multiview)1766 tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
1767                  VkSampleCountFlagBits samples,
1768                  bool multiview)
1769 {
1770    uint32_t gras_su_cntl = 0;
1771 
1772    if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
1773       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
1774    if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
1775       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
1776 
1777    if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
1778       gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
1779 
1780    gras_su_cntl |=
1781       A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
1782 
1783    if (rast_info->depthBiasEnable)
1784       gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
1785 
1786    if (samples > VK_SAMPLE_COUNT_1_BIT)
1787       gras_su_cntl |= A6XX_GRAS_SU_CNTL_MSAA_ENABLE;
1788 
1789    if (multiview) {
1790       gras_su_cntl |=
1791          A6XX_GRAS_SU_CNTL_UNK17 |
1792          A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
1793    }
1794 
1795    return gras_su_cntl;
1796 }
1797 
1798 void
tu6_emit_depth_bias(struct tu_cs * cs,float constant_factor,float clamp,float slope_factor)1799 tu6_emit_depth_bias(struct tu_cs *cs,
1800                     float constant_factor,
1801                     float clamp,
1802                     float slope_factor)
1803 {
1804    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
1805    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
1806    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
1807    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
1808 }
1809 
1810 static uint32_t
tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState * att,bool has_alpha)1811 tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
1812                          bool has_alpha)
1813 {
1814    const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
1815    const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
1816       has_alpha ? att->srcColorBlendFactor
1817                 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
1818    const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
1819       has_alpha ? att->dstColorBlendFactor
1820                 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
1821    const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
1822    const enum adreno_rb_blend_factor src_alpha_factor =
1823       tu6_blend_factor(att->srcAlphaBlendFactor);
1824    const enum adreno_rb_blend_factor dst_alpha_factor =
1825       tu6_blend_factor(att->dstAlphaBlendFactor);
1826 
1827    return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
1828           A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
1829           A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
1830           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
1831           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
1832           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
1833 }
1834 
1835 static uint32_t
tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState * att,uint32_t rb_mrt_control_rop,bool is_int,bool has_alpha)1836 tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
1837                    uint32_t rb_mrt_control_rop,
1838                    bool is_int,
1839                    bool has_alpha)
1840 {
1841    uint32_t rb_mrt_control =
1842       A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
1843 
1844    /* ignore blending and logic op for integer attachments */
1845    if (is_int) {
1846       rb_mrt_control |= A6XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
1847       return rb_mrt_control;
1848    }
1849 
1850    rb_mrt_control |= rb_mrt_control_rop;
1851 
1852    if (att->blendEnable) {
1853       rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
1854 
1855       if (has_alpha)
1856          rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
1857    }
1858 
1859    return rb_mrt_control;
1860 }
1861 
1862 static void
tu6_emit_rb_mrt_controls(struct tu_cs * cs,const VkPipelineColorBlendStateCreateInfo * blend_info,const VkFormat attachment_formats[MAX_RTS],uint32_t * blend_enable_mask)1863 tu6_emit_rb_mrt_controls(struct tu_cs *cs,
1864                          const VkPipelineColorBlendStateCreateInfo *blend_info,
1865                          const VkFormat attachment_formats[MAX_RTS],
1866                          uint32_t *blend_enable_mask)
1867 {
1868    *blend_enable_mask = 0;
1869 
1870    bool rop_reads_dst = false;
1871    uint32_t rb_mrt_control_rop = 0;
1872    if (blend_info->logicOpEnable) {
1873       rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
1874       rb_mrt_control_rop =
1875          A6XX_RB_MRT_CONTROL_ROP_ENABLE |
1876          A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
1877    }
1878 
1879    for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
1880       const VkPipelineColorBlendAttachmentState *att =
1881          &blend_info->pAttachments[i];
1882       const VkFormat format = attachment_formats[i];
1883 
1884       uint32_t rb_mrt_control = 0;
1885       uint32_t rb_mrt_blend_control = 0;
1886       if (format != VK_FORMAT_UNDEFINED) {
1887          const bool is_int = vk_format_is_int(format);
1888          const bool has_alpha = vk_format_has_alpha(format);
1889 
1890          rb_mrt_control =
1891             tu6_rb_mrt_control(att, rb_mrt_control_rop, is_int, has_alpha);
1892          rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
1893 
1894          if (att->blendEnable || rop_reads_dst)
1895             *blend_enable_mask |= 1 << i;
1896       }
1897 
1898       tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
1899       tu_cs_emit(cs, rb_mrt_control);
1900       tu_cs_emit(cs, rb_mrt_blend_control);
1901    }
1902 }
1903 
1904 static void
tu6_emit_blend_control(struct tu_cs * cs,uint32_t blend_enable_mask,bool dual_src_blend,const VkPipelineMultisampleStateCreateInfo * msaa_info)1905 tu6_emit_blend_control(struct tu_cs *cs,
1906                        uint32_t blend_enable_mask,
1907                        bool dual_src_blend,
1908                        const VkPipelineMultisampleStateCreateInfo *msaa_info)
1909 {
1910    const uint32_t sample_mask =
1911       msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
1912                              : ((1 << msaa_info->rasterizationSamples) - 1);
1913 
1914    tu_cs_emit_regs(cs,
1915                    A6XX_SP_BLEND_CNTL(.enabled = blend_enable_mask,
1916                                       .dual_color_in_enable = dual_src_blend,
1917                                       .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
1918                                       .unk8 = true));
1919 
1920    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
1921    tu_cs_emit_regs(cs,
1922                    A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
1923                                       .independent_blend = true,
1924                                       .sample_mask = sample_mask,
1925                                       .dual_color_in_enable = dual_src_blend,
1926                                       .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
1927                                       .alpha_to_one = msaa_info->alphaToOneEnable));
1928 }
1929 
1930 static VkResult
tu_pipeline_allocate_cs(struct tu_device * dev,struct tu_pipeline * pipeline,struct tu_pipeline_builder * builder,struct ir3_shader_variant * compute)1931 tu_pipeline_allocate_cs(struct tu_device *dev,
1932                         struct tu_pipeline *pipeline,
1933                         struct tu_pipeline_builder *builder,
1934                         struct ir3_shader_variant *compute)
1935 {
1936    uint32_t size = 2048 + tu6_load_state_size(pipeline, compute);
1937 
1938    /* graphics case: */
1939    if (builder) {
1940       for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
1941          if (builder->variants[i])
1942             size += builder->variants[i]->info.sizedwords;
1943       }
1944 
1945       size += builder->binning_variant->info.sizedwords;
1946    } else {
1947       size += compute->info.sizedwords;
1948    }
1949 
1950    tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
1951 
1952    /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
1953     * that LOAD_STATE can potentially take up a large amount of space so we
1954     * calculate its size explicitly.
1955    */
1956    return tu_cs_reserve_space(&pipeline->cs, size);
1957 }
1958 
1959 static void
tu_pipeline_shader_key_init(struct ir3_shader_key * key,const VkGraphicsPipelineCreateInfo * pipeline_info)1960 tu_pipeline_shader_key_init(struct ir3_shader_key *key,
1961                             const VkGraphicsPipelineCreateInfo *pipeline_info)
1962 {
1963    for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
1964       if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
1965          key->has_gs = true;
1966          break;
1967       }
1968    }
1969 
1970    if (pipeline_info->pRasterizationState->rasterizerDiscardEnable)
1971       return;
1972 
1973    const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
1974    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
1975       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1976    if (msaa_info->rasterizationSamples > 1 ||
1977        /* also set msaa key when sample location is not the default
1978         * since this affects varying interpolation */
1979        (sample_locations && sample_locations->sampleLocationsEnable)) {
1980       key->msaa = true;
1981    }
1982 
1983    /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */
1984    if (msaa_info->sampleShadingEnable)
1985       key->sample_shading = true;
1986 
1987    /* We set this after we compile to NIR because we need the prim mode */
1988    key->tessellation = IR3_TESS_NONE;
1989 }
1990 
1991 static uint32_t
tu6_get_tessmode(struct tu_shader * shader)1992 tu6_get_tessmode(struct tu_shader* shader)
1993 {
1994    uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode;
1995    switch (primitive_mode) {
1996    case GL_ISOLINES:
1997       return IR3_TESS_ISOLINES;
1998    case GL_TRIANGLES:
1999       return IR3_TESS_TRIANGLES;
2000    case GL_QUADS:
2001       return IR3_TESS_QUADS;
2002    case GL_NONE:
2003       return IR3_TESS_NONE;
2004    default:
2005       unreachable("bad tessmode");
2006    }
2007 }
2008 
2009 static uint64_t
tu_upload_variant(struct tu_pipeline * pipeline,const struct ir3_shader_variant * variant)2010 tu_upload_variant(struct tu_pipeline *pipeline,
2011                   const struct ir3_shader_variant *variant)
2012 {
2013    struct tu_cs_memory memory;
2014 
2015    if (!variant)
2016       return 0;
2017 
2018    /* this expects to get enough alignment because shaders are allocated first
2019     * and sizedwords is always aligned correctly
2020     * note: an assert in tu6_emit_xs_config validates the alignment
2021     */
2022    tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
2023 
2024    memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
2025    return memory.iova;
2026 }
2027 
2028 static VkResult
tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2029 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
2030                                     struct tu_pipeline *pipeline)
2031 {
2032    const struct ir3_compiler *compiler = builder->device->compiler;
2033    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
2034       NULL
2035    };
2036    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2037       gl_shader_stage stage =
2038          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2039       stage_infos[stage] = &builder->create_info->pStages[i];
2040    }
2041 
2042    struct ir3_shader_key key = {};
2043    tu_pipeline_shader_key_init(&key, builder->create_info);
2044 
2045    nir_shader *nir[MESA_SHADER_STAGES] = { NULL };
2046 
2047    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2048         stage < MESA_SHADER_STAGES; stage++) {
2049       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2050       if (!stage_info)
2051          continue;
2052 
2053       nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage);
2054       if (!nir[stage])
2055          return VK_ERROR_OUT_OF_HOST_MEMORY;
2056    }
2057 
2058    if (!nir[MESA_SHADER_FRAGMENT]) {
2059          const nir_shader_compiler_options *nir_options =
2060             ir3_get_compiler_options(builder->device->compiler);
2061          nir_builder fs_b;
2062          nir_builder_init_simple_shader(&fs_b, NULL, MESA_SHADER_FRAGMENT,
2063                                         nir_options);
2064          fs_b.shader->info.name = ralloc_strdup(fs_b.shader, "noop_fs");
2065          nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
2066    }
2067 
2068    /* TODO do intra-stage linking here */
2069 
2070    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2071         stage < MESA_SHADER_STAGES; stage++) {
2072       if (!nir[stage])
2073          continue;
2074 
2075       struct tu_shader *shader =
2076          tu_shader_create(builder->device, nir[stage],
2077                           builder->multiview_mask, builder->layout,
2078                           builder->alloc);
2079       if (!shader)
2080          return VK_ERROR_OUT_OF_HOST_MEMORY;
2081 
2082       /* In SPIR-V generated from GLSL, the primitive mode is specified in the
2083        * tessellation evaluation shader, but in SPIR-V generated from HLSL,
2084        * the mode is specified in the tessellation control shader. */
2085       if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
2086           key.tessellation == IR3_TESS_NONE) {
2087          key.tessellation = tu6_get_tessmode(shader);
2088       }
2089 
2090       builder->shaders[stage] = shader;
2091    }
2092 
2093    struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY];
2094    if (!last_shader)
2095       last_shader = builder->shaders[MESA_SHADER_TESS_EVAL];
2096    if (!last_shader)
2097       last_shader = builder->shaders[MESA_SHADER_VERTEX];
2098 
2099    uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;
2100 
2101    key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
2102    key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
2103    key.ucp_enables = MASK(last_shader->ir3_shader->nir->info.clip_distance_array_size);
2104 
2105    pipeline->tess.patch_type = key.tessellation;
2106 
2107    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2108         stage < MESA_SHADER_STAGES; stage++) {
2109       if (!builder->shaders[stage])
2110          continue;
2111 
2112       bool created;
2113       builder->variants[stage] =
2114          ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2115                                 &key, false, &created);
2116       if (!builder->variants[stage])
2117          return VK_ERROR_OUT_OF_HOST_MEMORY;
2118    }
2119 
2120    uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);
2121 
2122    key.safe_constlen = true;
2123 
2124    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2125         stage < MESA_SHADER_STAGES; stage++) {
2126       if (!builder->shaders[stage])
2127          continue;
2128 
2129       if (safe_constlens & (1 << stage)) {
2130          bool created;
2131          builder->variants[stage] =
2132             ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
2133                                    &key, false, &created);
2134          if (!builder->variants[stage])
2135             return VK_ERROR_OUT_OF_HOST_MEMORY;
2136       }
2137    }
2138 
2139    const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
2140    struct ir3_shader_variant *variant;
2141 
2142    if (vs->ir3_shader->stream_output.num_outputs ||
2143        !ir3_has_binning_vs(&key)) {
2144       variant = builder->variants[MESA_SHADER_VERTEX];
2145    } else {
2146       bool created;
2147       key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX));
2148       variant = ir3_shader_get_variant(vs->ir3_shader, &key,
2149                                        true, &created);
2150       if (!variant)
2151          return VK_ERROR_OUT_OF_HOST_MEMORY;
2152    }
2153 
2154    builder->binning_variant = variant;
2155 
2156    return VK_SUCCESS;
2157 }
2158 
2159 static void
tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2160 tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
2161                                   struct tu_pipeline *pipeline)
2162 {
2163    const VkPipelineDynamicStateCreateInfo *dynamic_info =
2164       builder->create_info->pDynamicState;
2165 
2166    if (!dynamic_info)
2167       return;
2168 
2169    pipeline->gras_su_cntl_mask = ~0u;
2170    pipeline->rb_depth_cntl_mask = ~0u;
2171    pipeline->rb_stencil_cntl_mask = ~0u;
2172 
2173    for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
2174       VkDynamicState state = dynamic_info->pDynamicStates[i];
2175       switch (state) {
2176       case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
2177          if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
2178             pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2179          pipeline->dynamic_state_mask |= BIT(state);
2180          break;
2181       case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
2182          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
2183          break;
2184       case VK_DYNAMIC_STATE_CULL_MODE_EXT:
2185          pipeline->gras_su_cntl_mask &=
2186             ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
2187          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2188          break;
2189       case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
2190          pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2191          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
2192          break;
2193       case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
2194          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
2195          break;
2196       case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
2197          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
2198          break;
2199       case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
2200          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
2201          break;
2202       case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
2203          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
2204          break;
2205       case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
2206          pipeline->rb_depth_cntl_mask &=
2207             ~(A6XX_RB_DEPTH_CNTL_Z_ENABLE | A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE);
2208          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2209          break;
2210       case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
2211          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2212          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2213          break;
2214       case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
2215          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
2216          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2217          break;
2218       case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
2219          pipeline->rb_depth_cntl_mask &=
2220             ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE);
2221          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
2222          break;
2223       case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
2224          pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2225                                              A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2226                                              A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
2227          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2228          break;
2229       case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
2230          pipeline->rb_stencil_cntl_mask &= A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2231                                            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2232                                            A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2233          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
2234          break;
2235       default:
2236          assert(!"unsupported dynamic state");
2237          break;
2238       }
2239    }
2240 }
2241 
2242 static void
tu_pipeline_set_linkage(struct tu_program_descriptor_linkage * link,struct tu_shader * shader,struct ir3_shader_variant * v)2243 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2244                         struct tu_shader *shader,
2245                         struct ir3_shader_variant *v)
2246 {
2247    link->const_state = *ir3_const_state(v);
2248    link->constlen = v->constlen;
2249    link->push_consts = shader->push_consts;
2250 }
2251 
2252 static void
tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2253 tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
2254                                         struct tu_pipeline *pipeline)
2255 {
2256    struct tu_cs prog_cs;
2257    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
2258    tu6_emit_program(&prog_cs, builder, false);
2259    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2260 
2261    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
2262    tu6_emit_program(&prog_cs, builder, true);
2263    pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2264 
2265    VkShaderStageFlags stages = 0;
2266    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
2267       stages |= builder->create_info->pStages[i].stage;
2268    }
2269    pipeline->active_stages = stages;
2270 
2271    uint32_t desc_sets = 0;
2272    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
2273       if (!builder->shaders[i])
2274          continue;
2275 
2276       tu_pipeline_set_linkage(&pipeline->program.link[i],
2277                               builder->shaders[i],
2278                               builder->variants[i]);
2279       desc_sets |= builder->shaders[i]->active_desc_sets;
2280    }
2281    pipeline->active_desc_sets = desc_sets;
2282 }
2283 
2284 static void
tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2285 tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
2286                                        struct tu_pipeline *pipeline)
2287 {
2288    const VkPipelineVertexInputStateCreateInfo *vi_info =
2289       builder->create_info->pVertexInputState;
2290    const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
2291    const struct ir3_shader_variant *bs = builder->binning_variant;
2292 
2293    pipeline->num_vbs = vi_info->vertexBindingDescriptionCount;
2294 
2295    struct tu_cs vi_cs;
2296    tu_cs_begin_sub_stream(&pipeline->cs,
2297                           MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2298    tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info);
2299    pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2300 
2301    if (bs) {
2302       tu_cs_begin_sub_stream(&pipeline->cs,
2303                              MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2304       tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info);
2305       pipeline->vi.binning_state =
2306          tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2307    }
2308 }
2309 
2310 static void
tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2311 tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
2312                                          struct tu_pipeline *pipeline)
2313 {
2314    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2315       builder->create_info->pInputAssemblyState;
2316 
2317    pipeline->ia.primtype = tu6_primtype(ia_info->topology);
2318    pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
2319 }
2320 
2321 static bool
tu_pipeline_static_state(struct tu_pipeline * pipeline,struct tu_cs * cs,uint32_t id,uint32_t size)2322 tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
2323                          uint32_t id, uint32_t size)
2324 {
2325    assert(id < ARRAY_SIZE(pipeline->dynamic_state));
2326 
2327    if (pipeline->dynamic_state_mask & BIT(id))
2328       return false;
2329 
2330    pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
2331    return true;
2332 }
2333 
2334 static void
tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2335 tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
2336                                        struct tu_pipeline *pipeline)
2337 {
2338    if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
2339        !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
2340       return;
2341 
2342    const VkPipelineTessellationStateCreateInfo *tess_info =
2343       builder->create_info->pTessellationState;
2344 
2345    assert(!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY)));
2346 
2347    assert(pipeline->ia.primtype == DI_PT_PATCHES0);
2348    assert(tess_info->patchControlPoints <= 32);
2349    pipeline->ia.primtype += tess_info->patchControlPoints;
2350    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
2351          vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
2352    pipeline->tess.upper_left_domain_origin = !domain_info ||
2353          domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
2354    const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
2355    const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
2356    pipeline->tess.param_stride = hs->output_size * 4;
2357    pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
2358    pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
2359 }
2360 
2361 static void
tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2362 tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
2363                                    struct tu_pipeline *pipeline)
2364 {
2365    /* The spec says:
2366     *
2367     *    pViewportState is a pointer to an instance of the
2368     *    VkPipelineViewportStateCreateInfo structure, and is ignored if the
2369     *    pipeline has rasterization disabled."
2370     *
2371     * We leave the relevant registers stale in that case.
2372     */
2373    if (builder->rasterizer_discard)
2374       return;
2375 
2376    const VkPipelineViewportStateCreateInfo *vp_info =
2377       builder->create_info->pViewportState;
2378 
2379    struct tu_cs cs;
2380 
2381    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
2382       tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount);
2383 
2384    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
2385       tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
2386 }
2387 
2388 static void
tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2389 tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
2390                                         struct tu_pipeline *pipeline)
2391 {
2392    const VkPipelineRasterizationStateCreateInfo *rast_info =
2393       builder->create_info->pRasterizationState;
2394 
2395    enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
2396 
2397    bool depth_clip_disable = rast_info->depthClampEnable;
2398 
2399    const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
2400       vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
2401    if (depth_clip_state)
2402       depth_clip_disable = !depth_clip_state->depthClipEnable;
2403 
2404    struct tu_cs cs;
2405    pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, 13);
2406 
2407    tu_cs_emit_regs(&cs,
2408                    A6XX_GRAS_CL_CNTL(
2409                      .znear_clip_disable = depth_clip_disable,
2410                      .zfar_clip_disable = depth_clip_disable,
2411                      /* TODO should this be depth_clip_disable instead? */
2412                      .unk5 = rast_info->depthClampEnable,
2413                      .zero_gb_scale_z = 1,
2414                      .vp_clip_code_ignore = 1));
2415 
2416    tu_cs_emit_regs(&cs,
2417                    A6XX_VPC_POLYGON_MODE(mode));
2418 
2419    tu_cs_emit_regs(&cs,
2420                    A6XX_PC_POLYGON_MODE(mode));
2421 
2422    /* move to hw ctx init? */
2423    tu_cs_emit_regs(&cs,
2424                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
2425                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
2426 
2427    const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
2428       vk_find_struct_const(rast_info->pNext,
2429                            PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
2430    unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
2431    tu_cs_emit_regs(&cs,
2432                    A6XX_PC_RASTER_CNTL(.stream = stream,
2433                                        .discard = rast_info->rasterizerDiscardEnable));
2434    tu_cs_emit_regs(&cs,
2435                    A6XX_VPC_UNKNOWN_9107(.raster_discard = rast_info->rasterizerDiscardEnable));
2436 
2437    pipeline->gras_su_cntl =
2438       tu6_gras_su_cntl(rast_info, builder->samples, builder->multiview_mask != 0);
2439 
2440    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
2441       tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));
2442 
2443    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
2444       tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
2445                           rast_info->depthBiasClamp,
2446                           rast_info->depthBiasSlopeFactor);
2447    }
2448 
2449 }
2450 
2451 static void
tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2452 tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
2453                                         struct tu_pipeline *pipeline)
2454 {
2455    /* The spec says:
2456     *
2457     *    pDepthStencilState is a pointer to an instance of the
2458     *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
2459     *    the pipeline has rasterization disabled or if the subpass of the
2460     *    render pass the pipeline is created against does not use a
2461     *    depth/stencil attachment.
2462     */
2463    const VkPipelineDepthStencilStateCreateInfo *ds_info =
2464       builder->create_info->pDepthStencilState;
2465    const VkPipelineRasterizationStateCreateInfo *rast_info =
2466       builder->create_info->pRasterizationState;
2467    uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
2468    struct tu_cs cs;
2469 
2470    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
2471        builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
2472       if (ds_info->depthTestEnable) {
2473          rb_depth_cntl |=
2474             A6XX_RB_DEPTH_CNTL_Z_ENABLE |
2475             A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
2476             A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
2477 
2478          if (rast_info->depthClampEnable)
2479             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE;
2480 
2481          if (ds_info->depthWriteEnable)
2482             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2483       }
2484 
2485       if (ds_info->depthBoundsTestEnable)
2486             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
2487    } else {
2488       /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
2489        * to 0 when this pipeline is used, as enabling depth test when there
2490        * is no depth attachment is a problem (at least for the S8_UINT case)
2491        */
2492       if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
2493          pipeline->rb_depth_cntl_disable = true;
2494    }
2495 
2496    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
2497       const VkStencilOpState *front = &ds_info->front;
2498       const VkStencilOpState *back = &ds_info->back;
2499 
2500       rb_stencil_cntl |=
2501          A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
2502          A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
2503          A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
2504          A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
2505          A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
2506          A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
2507          A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
2508          A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
2509 
2510       if (ds_info->stencilTestEnable) {
2511          rb_stencil_cntl |=
2512             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2513             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2514             A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2515       }
2516    }
2517 
2518    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
2519       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
2520       tu_cs_emit(&cs, rb_depth_cntl);
2521    } else {
2522       pipeline->rb_depth_cntl = rb_depth_cntl;
2523    }
2524 
2525    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
2526       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
2527       tu_cs_emit(&cs, rb_stencil_cntl);
2528    } else {
2529       pipeline->rb_stencil_cntl = rb_stencil_cntl;
2530    }
2531 
2532    /* the remaining draw states arent used if there is no d/s, leave them empty */
2533    if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
2534       return;
2535 
2536    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
2537       tu_cs_emit_regs(&cs,
2538                       A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
2539                       A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
2540    }
2541 
2542    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
2543       tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
2544                                                .bfmask = ds_info->back.compareMask & 0xff));
2545    }
2546 
2547    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
2548       tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.wrmask = ds_info->front.writeMask & 0xff,
2549                                                  .bfwrmask = ds_info->back.writeMask & 0xff));
2550    }
2551 
2552    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
2553       tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
2554                                               .bfref = ds_info->back.reference & 0xff));
2555    }
2556 
2557    if (ds_info->depthTestEnable) {
2558       pipeline->lrz.write = ds_info->depthWriteEnable;
2559       pipeline->lrz.invalidate = false;
2560       pipeline->lrz.z_test_enable = true;
2561 
2562       /* LRZ does not support some depth modes.
2563        *
2564        * The HW has a flag for GREATER and GREATER_OR_EQUAL modes which is used
2565        * in freedreno, however there are some dEQP-VK tests that fail if we use here.
2566        * Furthermore, blob disables LRZ on these comparison opcodes too.
2567        *
2568        * TODO: investigate if we can enable GREATER flag here.
2569        */
2570       switch(ds_info->depthCompareOp) {
2571       case VK_COMPARE_OP_ALWAYS:
2572       case VK_COMPARE_OP_NOT_EQUAL:
2573       case VK_COMPARE_OP_GREATER:
2574       case VK_COMPARE_OP_GREATER_OR_EQUAL:
2575          pipeline->lrz.invalidate = true;
2576          pipeline->lrz.write = false;
2577          break;
2578       case VK_COMPARE_OP_EQUAL:
2579       case VK_COMPARE_OP_NEVER:
2580          pipeline->lrz.enable = true;
2581          pipeline->lrz.write = false;
2582          break;
2583       case VK_COMPARE_OP_LESS:
2584       case VK_COMPARE_OP_LESS_OR_EQUAL:
2585          pipeline->lrz.enable = true;
2586          break;
2587       default:
2588          unreachable("bad VK_COMPARE_OP value");
2589          break;
2590       };
2591    }
2592 
2593    if (ds_info->stencilTestEnable) {
2594       pipeline->lrz.write = false;
2595       pipeline->lrz.invalidate = true;
2596    }
2597 
2598    if (builder->shaders[MESA_SHADER_FRAGMENT]) {
2599       const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0];
2600       if (fs->has_kill || fs->no_earlyz || fs->writes_pos) {
2601          pipeline->lrz.write = false;
2602       }
2603       if (fs->no_earlyz || fs->writes_pos) {
2604          pipeline->lrz.enable = false;
2605          pipeline->lrz.z_test_enable = false;
2606       }
2607    }
2608 }
2609 
2610 static void
tu_pipeline_builder_parse_multisample_and_color_blend(struct tu_pipeline_builder * builder,struct tu_pipeline * pipeline)2611 tu_pipeline_builder_parse_multisample_and_color_blend(
2612    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
2613 {
2614    /* The spec says:
2615     *
2616     *    pMultisampleState is a pointer to an instance of the
2617     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
2618     *    has rasterization disabled.
2619     *
2620     * Also,
2621     *
2622     *    pColorBlendState is a pointer to an instance of the
2623     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
2624     *    pipeline has rasterization disabled or if the subpass of the render
2625     *    pass the pipeline is created against does not use any color
2626     *    attachments.
2627     *
2628     * We leave the relevant registers stale when rasterization is disabled.
2629     */
2630    if (builder->rasterizer_discard)
2631       return;
2632 
2633    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
2634    const VkPipelineMultisampleStateCreateInfo *msaa_info =
2635       builder->create_info->pMultisampleState;
2636    const VkPipelineColorBlendStateCreateInfo *blend_info =
2637       builder->use_color_attachments ? builder->create_info->pColorBlendState
2638                                      : &dummy_blend_info;
2639 
2640    struct tu_cs cs;
2641    pipeline->blend_state =
2642       tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4);
2643 
2644    uint32_t blend_enable_mask;
2645    tu6_emit_rb_mrt_controls(&cs, blend_info,
2646                             builder->color_attachment_formats,
2647                             &blend_enable_mask);
2648 
2649    tu6_emit_blend_control(&cs, blend_enable_mask,
2650                           builder->use_dual_src_blend, msaa_info);
2651 
2652    assert(cs.cur == cs.end); /* validate draw state size */
2653 
2654    if (blend_enable_mask) {
2655       for (int i = 0; i < blend_info->attachmentCount; i++) {
2656          VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
2657          /* Disable LRZ writes when blend is enabled, since the
2658           * resulting pixel value from the blend-draw
2659           * depends on an earlier draw, which LRZ in the draw pass
2660           * could early-reject if the previous blend-enabled draw wrote LRZ.
2661           *
2662           * From the PoV of LRZ, having masked color channels is
2663           * the same as having blend enabled, in that the draw will
2664           * care about the fragments from an earlier draw.
2665           */
2666          if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
2667             pipeline->lrz.blend_disable_write = true;
2668          }
2669       }
2670    }
2671 
2672    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
2673       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2674       tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
2675    }
2676 
2677    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
2678       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
2679    const VkSampleLocationsInfoEXT *samp_loc = NULL;
2680 
2681    if (sample_locations && sample_locations->sampleLocationsEnable)
2682       samp_loc = &sample_locations->sampleLocationsInfo;
2683 
2684     if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
2685                                  samp_loc ? 9 : 6)) {
2686       tu6_emit_sample_locations(&cs, samp_loc);
2687     }
2688 }
2689 
2690 static void
tu_pipeline_finish(struct tu_pipeline * pipeline,struct tu_device * dev,const VkAllocationCallbacks * alloc)2691 tu_pipeline_finish(struct tu_pipeline *pipeline,
2692                    struct tu_device *dev,
2693                    const VkAllocationCallbacks *alloc)
2694 {
2695    tu_cs_finish(&pipeline->cs);
2696 }
2697 
2698 static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder * builder,struct tu_pipeline ** pipeline)2699 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
2700                           struct tu_pipeline **pipeline)
2701 {
2702    VkResult result;
2703 
2704    *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
2705                                 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
2706    if (!*pipeline)
2707       return VK_ERROR_OUT_OF_HOST_MEMORY;
2708 
2709    (*pipeline)->layout = builder->layout;
2710 
2711    /* compile and upload shaders */
2712    result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
2713    if (result != VK_SUCCESS) {
2714       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
2715       return result;
2716    }
2717 
2718    result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL);
2719    if (result != VK_SUCCESS) {
2720       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
2721       return result;
2722    }
2723 
2724    for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++)
2725       builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]);
2726 
2727    builder->binning_vs_iova =
2728       tu_upload_variant(*pipeline, builder->binning_variant);
2729 
2730    tu_pipeline_builder_parse_dynamic(builder, *pipeline);
2731    tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
2732    tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
2733    tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
2734    tu_pipeline_builder_parse_tessellation(builder, *pipeline);
2735    tu_pipeline_builder_parse_viewport(builder, *pipeline);
2736    tu_pipeline_builder_parse_rasterization(builder, *pipeline);
2737    tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
2738    tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
2739    tu6_emit_load_state(*pipeline, false);
2740 
2741    /* we should have reserved enough space upfront such that the CS never
2742     * grows
2743     */
2744    assert((*pipeline)->cs.bo_count == 1);
2745 
2746    return VK_SUCCESS;
2747 }
2748 
2749 static void
tu_pipeline_builder_finish(struct tu_pipeline_builder * builder)2750 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
2751 {
2752    for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
2753       if (!builder->shaders[i])
2754          continue;
2755       tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
2756    }
2757 }
2758 
2759 static void
tu_pipeline_builder_init_graphics(struct tu_pipeline_builder * builder,struct tu_device * dev,struct tu_pipeline_cache * cache,const VkGraphicsPipelineCreateInfo * create_info,const VkAllocationCallbacks * alloc)2760 tu_pipeline_builder_init_graphics(
2761    struct tu_pipeline_builder *builder,
2762    struct tu_device *dev,
2763    struct tu_pipeline_cache *cache,
2764    const VkGraphicsPipelineCreateInfo *create_info,
2765    const VkAllocationCallbacks *alloc)
2766 {
2767    TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);
2768 
2769    *builder = (struct tu_pipeline_builder) {
2770       .device = dev,
2771       .cache = cache,
2772       .create_info = create_info,
2773       .alloc = alloc,
2774       .layout = layout,
2775    };
2776 
2777    const struct tu_render_pass *pass =
2778       tu_render_pass_from_handle(create_info->renderPass);
2779    const struct tu_subpass *subpass =
2780       &pass->subpasses[create_info->subpass];
2781 
2782    builder->multiview_mask = subpass->multiview_mask;
2783 
2784    builder->rasterizer_discard =
2785       create_info->pRasterizationState->rasterizerDiscardEnable;
2786 
2787    if (builder->rasterizer_discard) {
2788       builder->samples = VK_SAMPLE_COUNT_1_BIT;
2789    } else {
2790       builder->samples = create_info->pMultisampleState->rasterizationSamples;
2791 
2792       const uint32_t a = subpass->depth_stencil_attachment.attachment;
2793       builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
2794          pass->attachments[a].format : VK_FORMAT_UNDEFINED;
2795 
2796       assert(subpass->color_count == 0 ||
2797              !create_info->pColorBlendState ||
2798              subpass->color_count == create_info->pColorBlendState->attachmentCount);
2799       builder->color_attachment_count = subpass->color_count;
2800       for (uint32_t i = 0; i < subpass->color_count; i++) {
2801          const uint32_t a = subpass->color_attachments[i].attachment;
2802          if (a == VK_ATTACHMENT_UNUSED)
2803             continue;
2804 
2805          builder->color_attachment_formats[i] = pass->attachments[a].format;
2806          builder->use_color_attachments = true;
2807          builder->render_components |= 0xf << (i * 4);
2808       }
2809 
2810       if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
2811          builder->color_attachment_count++;
2812          builder->use_dual_src_blend = true;
2813          /* dual source blending has an extra fs output in the 2nd slot */
2814          if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED)
2815             builder->render_components |= 0xf << 4;
2816       }
2817    }
2818 }
2819 
2820 static VkResult
tu_graphics_pipeline_create(VkDevice device,VkPipelineCache pipelineCache,const VkGraphicsPipelineCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipeline)2821 tu_graphics_pipeline_create(VkDevice device,
2822                             VkPipelineCache pipelineCache,
2823                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
2824                             const VkAllocationCallbacks *pAllocator,
2825                             VkPipeline *pPipeline)
2826 {
2827    TU_FROM_HANDLE(tu_device, dev, device);
2828    TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
2829 
2830    struct tu_pipeline_builder builder;
2831    tu_pipeline_builder_init_graphics(&builder, dev, cache,
2832                                      pCreateInfo, pAllocator);
2833 
2834    struct tu_pipeline *pipeline = NULL;
2835    VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
2836    tu_pipeline_builder_finish(&builder);
2837 
2838    if (result == VK_SUCCESS)
2839       *pPipeline = tu_pipeline_to_handle(pipeline);
2840    else
2841       *pPipeline = VK_NULL_HANDLE;
2842 
2843    return result;
2844 }
2845 
2846 VkResult
tu_CreateGraphicsPipelines(VkDevice device,VkPipelineCache pipelineCache,uint32_t count,const VkGraphicsPipelineCreateInfo * pCreateInfos,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipelines)2847 tu_CreateGraphicsPipelines(VkDevice device,
2848                            VkPipelineCache pipelineCache,
2849                            uint32_t count,
2850                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
2851                            const VkAllocationCallbacks *pAllocator,
2852                            VkPipeline *pPipelines)
2853 {
2854    VkResult final_result = VK_SUCCESS;
2855 
2856    for (uint32_t i = 0; i < count; i++) {
2857       VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
2858                                                     &pCreateInfos[i], pAllocator,
2859                                                     &pPipelines[i]);
2860 
2861       if (result != VK_SUCCESS)
2862          final_result = result;
2863    }
2864 
2865    return final_result;
2866 }
2867 
2868 static VkResult
tu_compute_pipeline_create(VkDevice device,VkPipelineCache _cache,const VkComputePipelineCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipeline)2869 tu_compute_pipeline_create(VkDevice device,
2870                            VkPipelineCache _cache,
2871                            const VkComputePipelineCreateInfo *pCreateInfo,
2872                            const VkAllocationCallbacks *pAllocator,
2873                            VkPipeline *pPipeline)
2874 {
2875    TU_FROM_HANDLE(tu_device, dev, device);
2876    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
2877    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
2878    VkResult result;
2879 
2880    struct tu_pipeline *pipeline;
2881 
2882    *pPipeline = VK_NULL_HANDLE;
2883 
2884    pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
2885                                VK_OBJECT_TYPE_PIPELINE);
2886    if (!pipeline)
2887       return VK_ERROR_OUT_OF_HOST_MEMORY;
2888 
2889    pipeline->layout = layout;
2890 
2891    struct ir3_shader_key key = {};
2892 
2893    nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE);
2894 
2895    struct tu_shader *shader =
2896       tu_shader_create(dev, nir, 0, layout, pAllocator);
2897    if (!shader) {
2898       result = VK_ERROR_OUT_OF_HOST_MEMORY;
2899       goto fail;
2900    }
2901 
2902    pipeline->active_desc_sets = shader->active_desc_sets;
2903 
2904    bool created;
2905    struct ir3_shader_variant *v =
2906       ir3_shader_get_variant(shader->ir3_shader, &key, false, &created);
2907    if (!v) {
2908       result = VK_ERROR_OUT_OF_HOST_MEMORY;
2909       goto fail;
2910    }
2911 
2912    tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
2913                            shader, v);
2914 
2915    result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v);
2916    if (result != VK_SUCCESS)
2917       goto fail;
2918 
2919    uint64_t shader_iova = tu_upload_variant(pipeline, v);
2920 
2921    for (int i = 0; i < 3; i++)
2922       pipeline->compute.local_size[i] = v->shader->nir->info.cs.local_size[i];
2923 
2924    struct tu_cs prog_cs;
2925    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
2926    tu6_emit_cs_config(&prog_cs, shader, v, shader_iova);
2927    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
2928 
2929    tu6_emit_load_state(pipeline, true);
2930 
2931    *pPipeline = tu_pipeline_to_handle(pipeline);
2932    return VK_SUCCESS;
2933 
2934 fail:
2935    if (shader)
2936       tu_shader_destroy(dev, shader, pAllocator);
2937 
2938    vk_object_free(&dev->vk, pAllocator, pipeline);
2939 
2940    return result;
2941 }
2942 
2943 VkResult
tu_CreateComputePipelines(VkDevice device,VkPipelineCache pipelineCache,uint32_t count,const VkComputePipelineCreateInfo * pCreateInfos,const VkAllocationCallbacks * pAllocator,VkPipeline * pPipelines)2944 tu_CreateComputePipelines(VkDevice device,
2945                           VkPipelineCache pipelineCache,
2946                           uint32_t count,
2947                           const VkComputePipelineCreateInfo *pCreateInfos,
2948                           const VkAllocationCallbacks *pAllocator,
2949                           VkPipeline *pPipelines)
2950 {
2951    VkResult final_result = VK_SUCCESS;
2952 
2953    for (uint32_t i = 0; i < count; i++) {
2954       VkResult result = tu_compute_pipeline_create(device, pipelineCache,
2955                                                    &pCreateInfos[i],
2956                                                    pAllocator, &pPipelines[i]);
2957       if (result != VK_SUCCESS)
2958          final_result = result;
2959    }
2960 
2961    return final_result;
2962 }
2963 
2964 void
tu_DestroyPipeline(VkDevice _device,VkPipeline _pipeline,const VkAllocationCallbacks * pAllocator)2965 tu_DestroyPipeline(VkDevice _device,
2966                    VkPipeline _pipeline,
2967                    const VkAllocationCallbacks *pAllocator)
2968 {
2969    TU_FROM_HANDLE(tu_device, dev, _device);
2970    TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2971 
2972    if (!_pipeline)
2973       return;
2974 
2975    tu_pipeline_finish(pipeline, dev, pAllocator);
2976    vk_object_free(&dev->vk, pAllocator, pipeline);
2977 }
2978