1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 
get_rel_patch_id(struct si_shader_context * ctx)29 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
30 {
31    switch (ctx->stage) {
32    case MESA_SHADER_TESS_CTRL:
33       return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
34 
35    case MESA_SHADER_TESS_EVAL:
36       return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
37 
38    default:
39       assert(0);
40       return NULL;
41    }
42 }
43 
44 /* Tessellation shaders pass outputs to the next shader using LDS.
45  *
46  * LS outputs = TCS inputs
47  * TCS outputs = TES inputs
48  *
49  * The LDS layout is:
50  * - TCS inputs for patch 0
51  * - TCS inputs for patch 1
52  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
53  * - ...
54  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
55  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
56  * - TCS outputs for patch 1
57  * - Per-patch TCS outputs for patch 1
58  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
59  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
60  * - ...
61  *
62  * All three shaders VS(LS), TCS, TES share the same LDS space.
63  */
64 
get_tcs_in_patch_stride(struct si_shader_context * ctx)65 static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx)
66 {
67    return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
68 }
69 
get_tcs_out_vertex_dw_stride_constant(struct si_shader_context * ctx)70 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
71 {
72    assert(ctx->stage == MESA_SHADER_TESS_CTRL);
73 
74    if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
75       return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
76 
77    return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
78 }
79 
get_tcs_out_vertex_dw_stride(struct si_shader_context * ctx)80 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
81 {
82    unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
83 
84    return LLVMConstInt(ctx->ac.i32, stride, 0);
85 }
86 
get_tcs_out_patch_stride(struct si_shader_context * ctx)87 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
88 {
89    if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
90       return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
91 
92    const struct si_shader_info *info = &ctx->shader->selector->info;
93    unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
94    unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
95    unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
96    unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
97    return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
98 }
99 
get_tcs_out_patch0_offset(struct si_shader_context * ctx)100 static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
101 {
102    return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
103                        LLVMConstInt(ctx->ac.i32, 4, 0), "");
104 }
105 
get_tcs_out_patch0_patch_data_offset(struct si_shader_context * ctx)106 static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
107 {
108    return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
109                        LLVMConstInt(ctx->ac.i32, 4, 0), "");
110 }
111 
get_tcs_in_current_patch_offset(struct si_shader_context * ctx)112 static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
113 {
114    LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
115    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
116 
117    return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
118 }
119 
get_tcs_out_current_patch_offset(struct si_shader_context * ctx)120 static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
121 {
122    LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
123    LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
124    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
125 
126    return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
127 }
128 
get_tcs_out_current_patch_data_offset(struct si_shader_context * ctx)129 static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
130 {
131    LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
132    LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
133    LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
134 
135    return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
136 }
137 
get_num_tcs_out_vertices(struct si_shader_context * ctx)138 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
139 {
140    unsigned tcs_out_vertices =
141       ctx->shader->selector ? ctx->shader->selector->info.base.tess.tcs_vertices_out
142                             : 0;
143 
144    /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
145    if (ctx->stage == MESA_SHADER_TESS_CTRL && tcs_out_vertices)
146       return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
147 
148    return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
149 }
150 
get_tcs_in_vertex_dw_stride(struct si_shader_context * ctx)151 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
152 {
153    unsigned stride;
154 
155    switch (ctx->stage) {
156    case MESA_SHADER_VERTEX:
157       stride = ctx->shader->selector->lshs_vertex_stride / 4;
158       return LLVMConstInt(ctx->ac.i32, stride, 0);
159 
160    case MESA_SHADER_TESS_CTRL:
161       if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) {
162          stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
163          return LLVMConstInt(ctx->ac.i32, stride, 0);
164       }
165       return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
166 
167    default:
168       assert(0);
169       return NULL;
170    }
171 }
172 
173 static LLVMValueRef
get_dw_address_from_generic_indices(struct si_shader_context * ctx,LLVMValueRef vertex_dw_stride,LLVMValueRef base_addr,LLVMValueRef vertex_index,LLVMValueRef param_index,ubyte name)174 get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride,
175                                     LLVMValueRef base_addr, LLVMValueRef vertex_index,
176                                     LLVMValueRef param_index, ubyte name)
177 {
178    if (vertex_dw_stride) {
179       base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr);
180    }
181 
182    if (param_index) {
183       base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
184    }
185 
186    int param = name >= VARYING_SLOT_PATCH0 ||
187                name == VARYING_SLOT_TESS_LEVEL_INNER ||
188                name == VARYING_SLOT_TESS_LEVEL_OUTER
189                   ? si_shader_io_get_unique_index_patch(name)
190                   : si_shader_io_get_unique_index(name, false);
191 
192    /* Add the base address of the element. */
193    return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
194 }
195 
196 /* The offchip buffer layout for TCS->TES is
197  *
198  * - attribute 0 of patch 0 vertex 0
199  * - attribute 0 of patch 0 vertex 1
200  * - attribute 0 of patch 0 vertex 2
201  *   ...
202  * - attribute 0 of patch 1 vertex 0
203  * - attribute 0 of patch 1 vertex 1
204  *   ...
205  * - attribute 1 of patch 0 vertex 0
206  * - attribute 1 of patch 0 vertex 1
207  *   ...
208  * - per patch attribute 0 of patch 0
209  * - per patch attribute 0 of patch 1
210  *   ...
211  *
212  * Note that every attribute has 4 components.
213  */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
                                               LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
                                               LLVMValueRef param_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   LLVMValueRef verts_per_patch = get_num_tcs_out_vertices(ctx);
   LLVMValueRef patch_count = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
   LLVMValueRef all_verts = LLVMBuildMul(builder, verts_per_patch, patch_count, "");
   LLVMValueRef sixteen = LLVMConstInt(ctx->ac.i32, 16, 0);

   /* Element index within one attribute, and the number of elements that
    * separate two consecutive attributes.
    */
   LLVMValueRef elem_index, attr_stride;

   if (vertex_index) {
      /* Per-vertex: rel_patch_id * verts_per_patch + vertex_index. */
      elem_index = ac_build_imad(&ctx->ac, rel_patch_id, verts_per_patch, vertex_index);
      attr_stride = all_verts;
   } else {
      /* Per-patch: one element per patch. */
      elem_index = rel_patch_id;
      attr_stride = patch_count;
   }

   elem_index = ac_build_imad(&ctx->ac, param_index, attr_stride, elem_index);

   /* Every element is scaled by 16 (see "every attribute has 4 components"). */
   LLVMValueRef addr = LLVMBuildMul(builder, elem_index, sixteen, "");

   if (vertex_index)
      return addr;

   /* Per-patch data is additionally offset by a value from tcs_offchip_layout. */
   LLVMValueRef patch_data_base = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);

   return LLVMBuildAdd(builder, addr, patch_data_base, "");
}
244 
/**
 * Wrapper around get_tcs_tes_buffer_address() that derives the attribute
 * index from a varying slot.
 *
 * \param vertex_index	vertex index, or NULL (passed through unchanged)
 * \param param_index	optional index added to the slot's unique index, or NULL
 * \param name		varying slot; patch slots and tess levels use the
 *			per-patch unique index, everything else the per-vertex one
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx,
                                                                    LLVMValueRef vertex_index,
                                                                    LLVMValueRef param_index,
                                                                    ubyte name)
{
   const bool is_patch_slot = name >= VARYING_SLOT_PATCH0 ||
                              name == VARYING_SLOT_TESS_LEVEL_INNER ||
                              name == VARYING_SLOT_TESS_LEVEL_OUTER;
   unsigned slot;

   if (is_patch_slot)
      slot = si_shader_io_get_unique_index_patch(name);
   else
      slot = si_shader_io_get_unique_index(name, false);

   LLVMValueRef attr_index = LLVMConstInt(ctx->ac.i32, slot, 0);

   if (param_index)
      attr_index = LLVMBuildAdd(ctx->ac.builder, param_index, attr_index, "");

   LLVMValueRef patch_id = get_rel_patch_id(ctx);

   return get_tcs_tes_buffer_address(ctx, patch_id, vertex_index, attr_index);
}
267 
/**
 * Load 4 channels from a buffer and return them as a vector of \p type, or
 * a single channel of that vector.
 *
 * \param type		element type of the result
 * \param swizzle	channel to extract; ~0 returns the whole 4-element vector
 * \param buffer	buffer descriptor
 * \param offset	forwarded to ac_build_buffer_load
 * \param base		forwarded to ac_build_buffer_load
 * \param can_speculate	forwarded to ac_build_buffer_load
 */
static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
                                LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base,
                                bool can_speculate)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   /* The load itself is the same whether one channel or all four are wanted. */
   LLVMValueRef loaded = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
                                              can_speculate, false);
   LLVMValueRef vec4 = LLVMBuildBitCast(builder, loaded, LLVMVectorType(type, 4), "");

   if (swizzle == ~0)
      return vec4;

   LLVMValueRef chan = LLVMConstInt(ctx->ac.i32, swizzle, 0);

   return LLVMBuildExtractElement(builder, vec4, chan, "");
}
289 
290 /**
291  * Load from LSHS LDS storage.
292  *
293  * \param type		output value type
294  * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
295  * \param dw_addr	address in dwords
296  */
static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
                                  LLVMValueRef dw_addr)
{
   if (swizzle != ~0) {
      /* Single channel: load from dw_addr + swizzle and cast to the requested type. */
      LLVMValueRef chan_offset = LLVMConstInt(ctx->ac.i32, swizzle, 0);
      LLVMValueRef chan_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, chan_offset, "");
      LLVMValueRef loaded = ac_lds_load(&ctx->ac, chan_addr);

      return LLVMBuildBitCast(ctx->ac.builder, loaded, type, "");
   }

   /* vec4: load the four channels one by one and gather them. */
   LLVMValueRef chans[4];

   for (unsigned c = 0; c < 4; c++)
      chans[c] = lshs_lds_load(ctx, type, c, dw_addr);

   return ac_build_gather_values(&ctx->ac, chans, 4);
}
315 
/**
 * Store to LSHS LDS storage.
 *
 * \param dw_offset_imm	constant dword offset added to dw_addr (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm,
                           LLVMValueRef dw_addr, LLVMValueRef value)
{
   LLVMValueRef imm = LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0);
   LLVMValueRef store_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, imm, "");

   ac_lds_store(&ctx->ac, store_addr, value);
}
331 
/* Selects which ring get_tess_ring_descriptor() builds a descriptor for. */
enum si_tess_ring
{
   /* Address derived from tcs_out_lds_layout plus the offchip ring size. */
   TCS_FACTOR_RING = 0,
   /* Address derived from tcs_out_lds_layout. */
   TESS_OFFCHIP_RING_TCS = 1,
   /* Address taken from tes_offchip_addr. */
   TESS_OFFCHIP_RING_TES = 2,
};
338 
get_tess_ring_descriptor(struct si_shader_context * ctx,enum si_tess_ring ring)339 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
340 {
341    LLVMBuilderRef builder = ctx->ac.builder;
342    LLVMValueRef addr = ac_get_arg(
343       &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
344 
345    /* TCS only receives high 13 bits of the address. */
346    if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
347       addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
348    }
349 
350    if (ring == TCS_FACTOR_RING) {
351       unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
352       addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
353    }
354 
355    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
356                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
357 
358    if (ctx->screen->info.chip_class >= GFX10)
359       rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
360                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
361    else
362       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
363                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
364 
365    LLVMValueRef desc[4];
366    desc[0] = addr;
367    desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
368    desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
369    desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
370 
371    return ac_build_gather_values(&ctx->ac, desc, 4);
372 }
373 
si_llvm_preload_tes_rings(struct si_shader_context * ctx)374 void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
375 {
376    ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
377 }
378 
/**
 * Load a TCS input or output varying from LDS.
 *
 * \param type			type of each loaded channel
 * \param vertex_index		vertex index, or NULL for per-patch varyings
 * \param param_index		optional indirect slot index, or NULL
 * \param driver_location	index into the input/output semantic table
 * \param component		first channel to load
 * \param num_components	number of channels to load
 * \param load_input		true to load a TCS input, false for a TCS output
 */
static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
                                             LLVMValueRef vertex_index, LLVMValueRef param_index,
                                             unsigned driver_location, unsigned component,
                                             unsigned num_components, bool load_input)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   const ubyte semantic = load_input ? info->input_semantic[driver_location]
                                     : info->output_semantic[driver_location];
   const bool is_patch = !vertex_index;
   LLVMValueRef vertex_stride, lds_addr;

   /* Per-patch slots must come without a vertex index and vice versa. */
   assert((semantic >= VARYING_SLOT_PATCH0 ||
           semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
           semantic == VARYING_SLOT_TESS_LEVEL_OUTER) == is_patch);

   if (load_input) {
      vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
      lds_addr = get_tcs_in_current_patch_offset(ctx);
   } else if (is_patch) {
      vertex_stride = NULL;
      lds_addr = get_tcs_out_current_patch_data_offset(ctx);
   } else {
      vertex_stride = get_tcs_out_vertex_dw_stride(ctx);
      lds_addr = get_tcs_out_current_patch_offset(ctx);
   }

   LLVMValueRef slot_index = param_index ? param_index : ctx->ac.i32_0;

   lds_addr = get_dw_address_from_generic_indices(ctx, vertex_stride, lds_addr, vertex_index,
                                                  slot_index, semantic);

   LLVMValueRef chans[4];

   for (unsigned chan = component; chan < component + num_components; chan++)
      chans[chan] = lshs_lds_load(ctx, type, chan, lds_addr);

   return ac_build_varying_gather_values(&ctx->ac, chans, num_components, component);
}
426 
/**
 * Load a TES input from the offchip ring cached in ctx->tess_offchip_ring.
 *
 * \param type			type of each loaded channel
 * \param vertex_index		vertex index, or NULL for per-patch inputs
 * \param param_index		optional indirect slot index, or NULL
 * \param driver_location	index into the input semantic table
 * \param component		first channel to load
 * \param num_components	number of channels to load
 * \param load_input		not used by this function
 */
static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type,
                                          LLVMValueRef vertex_index, LLVMValueRef param_index,
                                          unsigned driver_location, unsigned component,
                                          unsigned num_components,
                                          bool load_input)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   const ubyte semantic = info->input_semantic[driver_location];

   /* Per-patch slots must come without a vertex index and vice versa. */
   assert((semantic >= VARYING_SLOT_PATCH0 ||
           semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
           semantic == VARYING_SLOT_TESS_LEVEL_OUTER) == (vertex_index == NULL));

   LLVMValueRef ring_base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
   LLVMValueRef slot_index = param_index ? param_index : ctx->ac.i32_0;
   LLVMValueRef ring_addr =
      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, slot_index, semantic);

   /* TODO: This will generate rather ordinary llvm code, although it
    * should be easy for the optimiser to fix up. In future we might want
    * to refactor buffer_load().
    */
   LLVMValueRef chans[4];

   for (unsigned chan = component; chan < component + num_components; chan++)
      chans[chan] = buffer_load(ctx, type, chan, ctx->tess_offchip_ring, ring_base, ring_addr, true);

   return ac_build_varying_gather_values(&ctx->ac, chans, num_components, component);
}
462 
/**
 * Store a TCS output.
 *
 * Each written channel is stored to LDS when something reads it back, and
 * to the offchip ring unless it is a directly-addressed tess factor. Tess
 * factors that are defined in all invocations are additionally kept in
 * ctx->invoc0_tess_factors for the epilog.
 *
 * \param vertex_index		vertex index, or NULL for per-patch outputs
 * \param param_index		optional indirect slot index, or NULL
 * \param src			value(s) to store; element 0 maps to \p component
 * \param writemask		mask of channels to write, in absolute channel bits
 * \param component		first channel covered by \p src
 * \param location		not used by this function
 * \param driver_location	index into the output semantic/readmask tables
 */
static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
                                    LLVMValueRef vertex_index, LLVMValueRef param_index,
                                    LLVMValueRef src, unsigned writemask,
                                    unsigned component, unsigned location, unsigned driver_location)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef dw_addr, stride;
   LLVMValueRef buffer, base, addr;
   LLVMValueRef values[4]; /* indexed by channel, so 4 entries are enough */
   bool is_tess_factor = false, is_tess_inner = false;

   const ubyte semantic = info->output_semantic[driver_location];
   const bool tf_def_in_all_invocs = info->tessfactors_are_def_in_all_invocs;

   bool is_const = !param_index;
   if (!param_index)
      param_index = ctx->ac.i32_0;

   const bool is_patch = vertex_index == NULL;

   /* Invalid SPIR-V can cause this. */
   if ((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
        semantic == VARYING_SLOT_TESS_LEVEL_OUTER) != is_patch)
      return;

   if (!is_patch) {
      stride = get_tcs_out_vertex_dw_stride(ctx);
      dw_addr = get_tcs_out_current_patch_offset(ctx);
      dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
                                                    semantic);
   } else {
      dw_addr = get_tcs_out_current_patch_data_offset(ctx);
      dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
                                                    semantic);

      /* Only directly-addressed tess levels get the tess factor treatment. */
      if (is_const && (semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
                       semantic == VARYING_SLOT_TESS_LEVEL_OUTER)) {
         /* Always write tess factors into LDS for the TCS epilog. */
         is_tess_factor = true;
         is_tess_inner = semantic == VARYING_SLOT_TESS_LEVEL_INNER;
      }
   }

   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);

   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);

   addr =
      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, semantic);

   for (unsigned chan = component; chan < 4; chan++) {
      if (!(writemask & (1 << chan)))
         continue;
      LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);

      /* Skip LDS stores if there is no LDS read of this output. */
      if (info->output_readmask[driver_location] & (1 << chan) ||
          /* The epilog reads LDS if invocation 0 doesn't define tess factors. */
          (is_tess_factor && !tf_def_in_all_invocs))
         lshs_lds_store(ctx, chan, dw_addr, value);

      value = ac_to_integer(&ctx->ac, value);
      values[chan] = value;

      /* Partial writes go out one dword at a time; full vec4 writes are
       * batched into a single store after the loop.
       */
      if (writemask != 0xF && !is_tess_factor) {
         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base,
                                     4 * chan, ac_glc);
      }

      /* Write tess factors into VGPRs for the epilog. */
      if (is_tess_factor && tf_def_in_all_invocs) {
         if (!is_tess_inner) {
            LLVMBuildStore(ctx->ac.builder, value, /* outer */
                           ctx->invoc0_tess_factors[chan]);
         } else if (chan < 2) {
            LLVMBuildStore(ctx->ac.builder, value, /* inner */
                           ctx->invoc0_tess_factors[4 + chan]);
         }
      }
   }

   if (writemask == 0xF && !is_tess_factor) {
      LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4);
      ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc);
   }
}
554 
si_load_tess_coord(struct ac_shader_abi * abi)555 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
556 {
557    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
558    LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v),
559                             ctx->ac.f32_0, ctx->ac.f32_0};
560 
561    /* For triangles, the vector should be (u, v, 1-u-v). */
562    if (ctx->shader->selector->info.base.tess.primitive_mode == GL_TRIANGLES) {
563       coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
564                                LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
565    }
566    return ac_build_gather_values(&ctx->ac, coord, 4);
567 }
568 
load_tess_level(struct si_shader_context * ctx,unsigned semantic)569 static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic)
570 {
571    LLVMValueRef base, addr;
572 
573    int param = si_shader_io_get_unique_index_patch(semantic);
574 
575    base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
576    addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
577                                      LLVMConstInt(ctx->ac.i32, param, 0));
578 
579    return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true);
580 }
581 
load_tess_level_default(struct si_shader_context * ctx,unsigned sysval)582 static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned sysval)
583 {
584    LLVMValueRef buf, slot, val[4];
585    int i, offset;
586 
587    slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
588    buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
589    buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
590    offset = sysval == SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT ? 4 : 0;
591 
592    for (i = 0; i < 4; i++)
593       val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
594    return ac_build_gather_values(&ctx->ac, val, 4);
595 }
596 
si_load_tess_level(struct ac_shader_abi * abi,unsigned varying_id,bool load_default_state)597 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id,
598                                        bool load_default_state)
599 {
600    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
601    unsigned semantic;
602 
603    if (load_default_state) {
604       switch (varying_id) {
605       case VARYING_SLOT_TESS_LEVEL_INNER:
606          semantic = SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT;
607          break;
608       case VARYING_SLOT_TESS_LEVEL_OUTER:
609          semantic = SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT;
610          break;
611       default:
612          unreachable("unknown tess level");
613       }
614       return load_tess_level_default(ctx, semantic);
615    }
616 
617    switch (varying_id) {
618    case VARYING_SLOT_TESS_LEVEL_INNER:
619       semantic = VARYING_SLOT_TESS_LEVEL_INNER;
620       break;
621    case VARYING_SLOT_TESS_LEVEL_OUTER:
622       semantic = VARYING_SLOT_TESS_LEVEL_OUTER;
623       break;
624    default:
625       unreachable("unknown tess level");
626    }
627 
628    return load_tess_level(ctx, semantic);
629 }
630 
si_load_patch_vertices_in(struct ac_shader_abi * abi)631 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
632 {
633    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
634    if (ctx->stage == MESA_SHADER_TESS_CTRL)
635       return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
636    else if (ctx->stage == MESA_SHADER_TESS_EVAL)
637       return get_num_tcs_out_vertices(ctx);
638    else
639       unreachable("invalid shader stage for VERTICESIN");
640 }
641 
642 /**
643  * Forward all outputs from the vertex shader to the TES. This is only used
644  * for the fixed function TCS.
645  */
si_copy_tcs_inputs(struct si_shader_context * ctx)646 static void si_copy_tcs_inputs(struct si_shader_context *ctx)
647 {
648    LLVMValueRef invocation_id, buffer, buffer_offset;
649    LLVMValueRef lds_vertex_stride, lds_base;
650    uint64_t inputs;
651 
652    invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
653    buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
654    buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
655 
656    lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
657    lds_base = get_tcs_in_current_patch_offset(ctx);
658    lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base);
659 
660    inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
661    while (inputs) {
662       unsigned i = u_bit_scan64(&inputs);
663 
664       LLVMValueRef lds_ptr =
665          LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), "");
666 
667       LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(
668          ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0));
669 
670       LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
671 
672       ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0,
673                                   ac_glc);
674    }
675 }
676 
/**
 * Emit the code that writes the tessellation factors of one patch to the
 * tess factor ring and, when the TES reads them, to the offchip ring as well.
 *
 * All loads and stores emitted here are wrapped in "if (invocation_id == 0)".
 *
 * \param ctx            shader context; ctx->shader->key.part.tcs.epilog
 *                       selects the primitive mode and where the factors
 *                       come from
 * \param rel_patch_id   i32 patch index; scaled by the per-patch stride to
 *                       form the byte offset into the tess factor ring
 * \param invocation_id  i32; only the invocation for which this is 0
 *                       performs the loads and stores
 * \param tcs_out_current_patch_data_offset
 *                       LDS address the tess levels are loaded from when
 *                       they are not passed in through invoc0_tf_*
 * \param invoc0_tf_outer  outer tess factors; read only when
 *                         epilog.invoc0_tess_factors_are_def is set
 * \param invoc0_tf_inner  inner tess factors; read only when
 *                         epilog.invoc0_tess_factors_are_def is set
 */
static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
                                  LLVMValueRef invocation_id,
                                  LLVMValueRef tcs_out_current_patch_data_offset,
                                  LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
{
   struct si_shader *shader = ctx->shader;
   unsigned tess_inner_index, tess_outer_index;
   LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
   LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
   unsigned stride, outer_comps, inner_comps, i, offset;

   /* Add a barrier before loading tess factors from LDS. It is not needed
    * when the factors are passed in through invoc0_tf_*, because LDS is not
    * read in that case.
    */
   if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
      si_llvm_emit_barrier(ctx);

   /* Do this only for invocation 0, because the tess levels are per-patch,
    * not per-vertex.
    *
    * This can't jump, because invocation 0 executes this. It should
    * at least mask out the loads and stores for other invocations.
    */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);

   /* Determine the layout of one tess factor element in the buffer.
    * stride is the number of dwords written per patch; outer_comps and
    * inner_comps are the number of outer/inner levels the mode uses.
    */
   switch (shader->key.part.tcs.epilog.prim_mode) {
   case GL_LINES:
      stride = 2; /* 2 dwords, 1 vec2 store */
      outer_comps = 2;
      inner_comps = 0;
      break;
   case GL_TRIANGLES:
      stride = 4; /* 4 dwords, 1 vec4 store */
      outer_comps = 3;
      inner_comps = 1;
      break;
   case GL_QUADS:
      stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
      outer_comps = 4;
      inner_comps = 2;
      break;
   default:
      /* NOTE(review): this returns with the "if" labelled 6503 still open;
       * it is only reachable with an unexpected prim_mode.
       */
      assert(0);
      return;
   }

   /* Components a primitive mode does not use stay undefined. */
   for (i = 0; i < 4; i++) {
      inner[i] = LLVMGetUndef(ctx->ac.i32);
      outer[i] = LLVMGetUndef(ctx->ac.i32);
   }

   /* Fill outer[]/inner[] (used for the offchip copy) and out[] (used for
    * the tess factor ring: the outer levels followed by the inner levels).
    */
   if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
      /* Tess factors are in VGPRs. */
      for (i = 0; i < outer_comps; i++)
         outer[i] = out[i] = invoc0_tf_outer[i];
      for (i = 0; i < inner_comps; i++)
         inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
   } else {
      /* Load tess_inner and tess_outer from LDS.
       * Any invocation can write them, so we can't get them from a temporary.
       */
      tess_inner_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
      tess_outer_index = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);

      /* Each patch output slot is 4 address units wide. */
      lds_base = tcs_out_current_patch_data_offset;
      lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
      lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
                               LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");

      for (i = 0; i < outer_comps; i++) {
         outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
      }
      for (i = 0; i < inner_comps; i++) {
         inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
      }
   }

   if (shader->key.part.tcs.epilog.prim_mode == GL_LINES) {
      /* For isolines, the hardware expects tess factors in the
       * reverse order from what NIR specifies.
       *
       * Only out[] is swapped; outer[] keeps its original order for the
       * offchip copy below.
       */
      LLVMValueRef tmp = out[0];
      out[0] = out[1];
      out[1] = tmp;
   }

   /* Convert the outputs to vectors for stores: vec0 holds the first
    * (up to) 4 dwords, vec1 the remaining ones when stride exceeds 4.
    */
   vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
   vec1 = NULL;

   if (stride > 4)
      vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);

   /* Get the buffer. */
   buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);

   /* Get the offset: rel_patch_id * (stride dwords * 4 bytes). */
   tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset);
   byteoffset =
      LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");

   /* Only the patch with rel_patch_id == 0 executes the control word store. */
   ac_build_ifcc(&ctx->ac,
                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);

   /* Store the dynamic HS control word.
    *
    * "offset" is a build-time constant, not a shader value: on <= GFX8 the
    * bump below shifts the tess factor stores of every patch by 4 bytes,
    * not only those of patch 0.
    */
   offset = 0;
   if (ctx->screen->info.chip_class <= GFX8) {
      ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1,
                                  ctx->ac.i32_0, tf_base, offset, ac_glc);
      offset += 4;
   }

   ac_build_endif(&ctx->ac, 6504);

   /* Store the tessellation factors. */
   ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset,
                               ac_glc);
   /* The second store, if any, lands right after the 4 dwords of vec0. */
   offset += 16;
   if (vec1)
      ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset,
                                  ac_glc);

   /* Store the tess factors into the offchip buffer if TES reads them. */
   if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
      LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
      LLVMValueRef tf_inner_offset;
      unsigned param_outer, param_inner;

      buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
      base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);

      /* Address of the outer tess level patch output; no vertex index is
       * passed (NULL).
       */
      param_outer = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER);
      tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                   LLVMConstInt(ctx->ac.i32, param_outer, 0));

      /* Without vec3 support the gathered vector is padded to the next
       * power of two using the undefined components of outer[]; the store
       * below is still issued with outer_comps components.
       */
      unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false)
                                   ? outer_comps
                                   : util_next_power_of_two(outer_comps);
      outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);

      ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0,
                                  ac_glc);
      /* Isolines have no inner levels, so nothing more is stored for them. */
      if (inner_comps) {
         param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));

         /* A single inner level is stored as a scalar instead of a vector. */
         inner_vec =
            inner_comps == 1 ? inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps);
         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base,
                                     0, ac_glc);
      }
   }

   /* Close the "invocation_id == 0" block opened at the top. */
   ac_build_endif(&ctx->ac, 6503);
}
834 
835 /* This only writes the tessellation factor levels. */
si_llvm_emit_tcs_epilogue(struct ac_shader_abi * abi,unsigned max_outputs,LLVMValueRef * addrs)836 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
837                                       LLVMValueRef *addrs)
838 {
839    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
840    LLVMBuilderRef builder = ctx->ac.builder;
841    LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
842 
843    si_copy_tcs_inputs(ctx);
844 
845    rel_patch_id = get_rel_patch_id(ctx);
846    invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
847    tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
848 
849    if (ctx->screen->info.chip_class >= GFX9) {
850       LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
851       LLVMValueRef values[2];
852 
853       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
854 
855       values[0] = rel_patch_id;
856       values[1] = LLVMGetUndef(ctx->ac.i32);
857       rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
858 
859       values[0] = tf_lds_offset;
860       values[1] = LLVMGetUndef(ctx->ac.i32);
861       tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
862 
863       values[0] = invocation_id;
864       values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
865       invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
866    }
867 
868    /* Return epilog parameters from this function. */
869    LLVMValueRef ret = ctx->return_value;
870    unsigned vgpr;
871 
872    if (ctx->screen->info.chip_class >= GFX9) {
873       ret =
874          si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
875       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
876       /* Tess offchip and tess factor offsets are at the beginning. */
877       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
878       ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
879       vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
880    } else {
881       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
882       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
883       /* Tess offchip and tess factor offsets are after user SGPRs. */
884       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
885       ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
886       vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
887    }
888 
889    /* VGPRs */
890    rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
891    invocation_id = ac_to_float(&ctx->ac, invocation_id);
892    tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
893 
894    /* Leave a hole corresponding to the two input VGPRs. This ensures that
895     * the invocation_id output does not alias the tcs_rel_ids input,
896     * which saves a V_MOV on gfx9.
897     */
898    vgpr += 2;
899 
900    ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
901    ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
902 
903    if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
904       vgpr++; /* skip the tess factor LDS offset */
905       for (unsigned i = 0; i < 6; i++) {
906          LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
907          value = ac_to_float(&ctx->ac, value);
908          ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
909       }
910    } else {
911       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
912    }
913    ctx->return_value = ret;
914 }
915 
916 /* Pass TCS inputs from LS to TCS on GFX9. */
si_set_ls_return_value_for_tcs(struct si_shader_context * ctx)917 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
918 {
919    LLVMValueRef ret = ctx->return_value;
920 
921    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
922    ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
923    ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
924    ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
925    ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
926    ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
927 
928    ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
929    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
930                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
931 
932    ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
933 
934    ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
935    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
936    ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
937 
938    unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
939    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
940                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
941                               vgpr++, "");
942    ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
943                               ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
944                               vgpr++, "");
945    ctx->return_value = ret;
946 }
947 
/* End of a vertex shader compiled as LS: store every written output
 * component to LDS and, on GFX9 and later, set up the return value that
 * passes the TCS inputs on.
 */
void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;
   struct si_shader_info *info = &ctx->shader->selector->info;

   /* LDS address of this vertex: relative vertex index * vertex stride. */
   LLVMValueRef rel_vertex = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
   LLVMValueRef dw_stride = get_tcs_in_vertex_dw_stride(ctx);
   LLVMValueRef vertex_base = LLVMBuildMul(builder, rel_vertex, dw_stride, "");

   /* Write outputs to LDS. The next shader (TCS aka HS) will read
    * its inputs from it.
    */
   for (unsigned out = 0; out < info->num_outputs; out++) {
      unsigned semantic = info->output_semantic[out];

      /* The ARB_shader_viewport_layer_array spec contains the
       * following issue:
       *
       *    2) What happens if gl_ViewportIndex or gl_Layer is
       *    written in the vertex shader and a geometry shader is
       *    present?
       *
       *    RESOLVED: The value written by the last vertex processing
       *    stage is used. If the last vertex processing stage
       *    (vertex, tessellation evaluation or geometry) does not
       *    statically assign to gl_ViewportIndex or gl_Layer, index
       *    or layer zero is assumed.
       *
       * So writes to those outputs in VS-as-LS are simply ignored.
       */
      bool ignored = semantic == VARYING_SLOT_LAYER || semantic == VARYING_SLOT_VIEWPORT;

      if (!ignored) {
         /* Each output occupies 4 consecutive addresses, one per channel. */
         int slot = si_shader_io_get_unique_index(semantic, false);
         LLVMValueRef slot_addr =
            LLVMBuildAdd(builder, vertex_base, LLVMConstInt(ctx->ac.i32, slot * 4, 0), "");

         /* Only the channels present in the usage mask are stored. */
         for (unsigned comp = 0; comp < 4; comp++) {
            if (info->output_usagemask[out] & (1 << comp)) {
               LLVMValueRef value = LLVMBuildLoad(builder, addrs[4 * out + comp], "");

               lshs_lds_store(ctx, comp, slot_addr, value);
            }
         }
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_ls_return_value_for_tcs(ctx);
}
997 
998 /**
999  * Compile the TCS epilog function. This writes tesselation factors to memory
1000  * based on the output primitive type of the tesselator (determined by TES).
1001  */
si_llvm_build_tcs_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)1002 void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
1003 {
1004    memset(&ctx->args, 0, sizeof(ctx->args));
1005 
1006    if (ctx->screen->info.chip_class >= GFX9) {
1007       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1008       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1009       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
1010       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
1011       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
1012       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1013       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1014       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1015       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1016       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1017       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1018       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1019       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1020       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1021       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1022       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1023       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
1024       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1025       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
1026    } else {
1027       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1028       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1029       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1030       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1031       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
1032       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1033       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
1034       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1035       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
1036       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
1037    }
1038 
1039    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
1040    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
1041    struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
1042    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
1043    struct ac_arg invocation_id; /* invocation ID within the patch */
1044    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
1045    struct ac_arg
1046       tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
1047    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
1048 
1049    struct ac_arg tess_factors[6];
1050    for (unsigned i = 0; i < 6; i++)
1051       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
1052 
1053    /* Create the function. */
1054    si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
1055    ac_declare_lds_as_pointer(&ctx->ac);
1056 
1057    LLVMValueRef invoc0_tess_factors[6];
1058    for (unsigned i = 0; i < 6; i++)
1059       invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
1060 
1061    si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
1062                          ac_get_arg(&ctx->ac, invocation_id),
1063                          ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
1064                          invoc0_tess_factors, invoc0_tess_factors + 4);
1065 
1066    LLVMBuildRetVoid(ctx->ac.builder);
1067 }
1068 
si_llvm_init_tcs_callbacks(struct si_shader_context * ctx)1069 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
1070 {
1071    ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
1072    ctx->abi.load_tess_level = si_load_tess_level;
1073    ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
1074    ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
1075    ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
1076 }
1077 
si_llvm_init_tes_callbacks(struct si_shader_context * ctx,bool ngg_cull_shader)1078 void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
1079 {
1080    ctx->abi.load_tess_varyings = si_nir_load_input_tes;
1081    ctx->abi.load_tess_coord = si_load_tess_coord;
1082    ctx->abi.load_tess_level = si_load_tess_level;
1083    ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
1084 
1085    if (ctx->shader->key.as_es)
1086       ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
1087    else if (ngg_cull_shader)
1088       ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
1089    else if (ctx->shader->key.as_ngg)
1090       ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
1091    else
1092       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
1093 }
1094