1 /*
2  * Copyright © 2019 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "ir3_nir.h"
25 #include "ir3_compiler.h"
26 #include "compiler/nir/nir_builder.h"
27 
28 struct state {
29 	uint32_t topology;
30 
31 	struct primitive_map {
32 		unsigned loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */
33 		unsigned stride;
34 	} map;
35 
36 	nir_ssa_def *header;
37 
38 	nir_variable *vertex_count_var;
39 	nir_variable *emitted_vertex_var;
40 	nir_variable *vertex_flags_out;
41 
42 	struct exec_list old_outputs;
43 	struct exec_list new_outputs;
44 	struct exec_list emit_outputs;
45 
46 	/* tess ctrl shader on a650 gets the local primitive id at different bits: */
47 	unsigned local_primitive_id_start;
48 };
49 
50 static nir_ssa_def *
bitfield_extract(nir_builder * b,nir_ssa_def * v,uint32_t start,uint32_t mask)51 bitfield_extract(nir_builder *b, nir_ssa_def *v, uint32_t start, uint32_t mask)
52 {
53 	return nir_iand(b, nir_ushr(b, v, nir_imm_int(b, start)),
54 			nir_imm_int(b, mask));
55 }
56 
57 static nir_ssa_def *
build_invocation_id(nir_builder * b,struct state * state)58 build_invocation_id(nir_builder *b, struct state *state)
59 {
60 	return bitfield_extract(b, state->header, 11, 31);
61 }
62 
63 static nir_ssa_def *
build_vertex_id(nir_builder * b,struct state * state)64 build_vertex_id(nir_builder *b, struct state *state)
65 {
66 	return bitfield_extract(b, state->header, 6, 31);
67 }
68 
69 static nir_ssa_def *
build_local_primitive_id(nir_builder * b,struct state * state)70 build_local_primitive_id(nir_builder *b, struct state *state)
71 {
72 	return bitfield_extract(b, state->header, state->local_primitive_id_start, 63);
73 }
74 
75 static bool
is_tess_levels(gl_varying_slot slot)76 is_tess_levels(gl_varying_slot slot)
77 {
78 	return (slot == VARYING_SLOT_TESS_LEVEL_OUTER ||
79 			slot == VARYING_SLOT_TESS_LEVEL_INNER);
80 }
81 
82 /* Return a deterministic index for varyings. We can't rely on driver_location
83  * to be correct without linking the different stages first, so we create
84  * "primitive maps" where the producer decides on the location of each varying
85  * slot and then exports a per-slot array to the consumer. This compacts the
86  * gl_varying_slot space down a bit so that the primitive maps aren't too
87  * large.
88  *
89  * Note: per-patch varyings are currently handled separately, without any
90  * compacting.
91  *
92  * TODO: We could probably use the driver_location's directly in the non-SSO
93  * (Vulkan) case.
94  */
95 
96 static unsigned
shader_io_get_unique_index(gl_varying_slot slot)97 shader_io_get_unique_index(gl_varying_slot slot)
98 {
99 	if (slot == VARYING_SLOT_POS)
100 		return 0;
101 	if (slot == VARYING_SLOT_PSIZ)
102 		return 1;
103 	if (slot == VARYING_SLOT_CLIP_DIST0)
104 		return 2;
105 	if (slot == VARYING_SLOT_CLIP_DIST1)
106 		return 3;
107 	if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
108 		return 4 + (slot - VARYING_SLOT_VAR0);
109 	unreachable("illegal slot in get unique index\n");
110 }
111 
112 static nir_ssa_def *
build_local_offset(nir_builder * b,struct state * state,nir_ssa_def * vertex,uint32_t location,uint32_t comp,nir_ssa_def * offset)113 build_local_offset(nir_builder *b, struct state *state,
114 		nir_ssa_def *vertex, uint32_t location, uint32_t comp, nir_ssa_def *offset)
115 {
116 	nir_ssa_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
117 	nir_ssa_def *primitive_offset =
118 		nir_imul24(b, build_local_primitive_id(b, state), primitive_stride);
119 	nir_ssa_def *attr_offset;
120 	nir_ssa_def *vertex_stride;
121 	unsigned index = shader_io_get_unique_index(location);
122 
123 	switch (b->shader->info.stage) {
124 	case MESA_SHADER_VERTEX:
125 	case MESA_SHADER_TESS_EVAL:
126 		vertex_stride = nir_imm_int(b, state->map.stride * 4);
127 		attr_offset = nir_imm_int(b, state->map.loc[index] + 4 * comp);
128 		break;
129 	case MESA_SHADER_TESS_CTRL:
130 	case MESA_SHADER_GEOMETRY:
131 		vertex_stride = nir_load_vs_vertex_stride_ir3(b);
132 		attr_offset = nir_iadd(b, nir_load_primitive_location_ir3(b, index),
133 							   nir_imm_int(b, comp * 4));
134 		break;
135 	default:
136 		unreachable("bad shader stage");
137 	}
138 
139 	nir_ssa_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);
140 
141 	return nir_iadd(b, nir_iadd(b, primitive_offset, vertex_offset),
142 			nir_iadd(b, attr_offset, nir_ishl(b, offset, nir_imm_int(b, 4))));
143 }
144 
145 static nir_intrinsic_instr *
replace_intrinsic(nir_builder * b,nir_intrinsic_instr * intr,nir_intrinsic_op op,nir_ssa_def * src0,nir_ssa_def * src1,nir_ssa_def * src2)146 replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
147 		nir_intrinsic_op op, nir_ssa_def *src0, nir_ssa_def *src1, nir_ssa_def *src2)
148 {
149 	nir_intrinsic_instr *new_intr =
150 		nir_intrinsic_instr_create(b->shader, op);
151 
152 	new_intr->src[0] = nir_src_for_ssa(src0);
153 	if (src1)
154 		new_intr->src[1] = nir_src_for_ssa(src1);
155 	if (src2)
156 		new_intr->src[2] = nir_src_for_ssa(src2);
157 
158 	new_intr->num_components = intr->num_components;
159 
160 	if (nir_intrinsic_infos[op].has_dest)
161 		nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
162 						  intr->num_components, 32, NULL);
163 
164 	nir_builder_instr_insert(b, &new_intr->instr);
165 
166 	if (nir_intrinsic_infos[op].has_dest)
167 		nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&new_intr->dest.ssa));
168 
169 	nir_instr_remove(&intr->instr);
170 
171 	return new_intr;
172 }
173 
174 static void
build_primitive_map(nir_shader * shader,struct primitive_map * map)175 build_primitive_map(nir_shader *shader, struct primitive_map *map)
176 {
177 	/* All interfaces except the TCS <-> TES interface use ldlw, which takes
178 	 * an offset in bytes, so each vec4 slot is 16 bytes. TCS <-> TES uses
179 	 * ldg, which takes an offset in dwords, but each per-vertex slot has
180 	 * space for every vertex, and there's space at the beginning for
181 	 * per-patch varyings.
182 	 */
183 	unsigned slot_size = 16, start = 0;
184 	if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
185 		slot_size = shader->info.tess.tcs_vertices_out * 4;
186 		start = util_last_bit(shader->info.patch_outputs_written) * 4;
187 	}
188 
189 	uint64_t mask = shader->info.outputs_written;
190 	unsigned loc = start;
191 	while (mask) {
192 		int location = u_bit_scan64(&mask);
193 		if (is_tess_levels(location))
194 			continue;
195 
196 		unsigned index = shader_io_get_unique_index(location);
197 		map->loc[index] = loc;
198 		loc += slot_size;
199 	}
200 
201 	map->stride = loc;
202 	/* Use units of dwords for the stride. */
203 	if (shader->info.stage != MESA_SHADER_TESS_CTRL)
204 		map->stride /= 4;
205 }
206 
207 /* For shader stages that receive a primitive map, calculate how big it should
208  * be.
209  */
210 
211 static unsigned
calc_primitive_map_size(nir_shader * shader)212 calc_primitive_map_size(nir_shader *shader)
213 {
214 	uint64_t mask = shader->info.inputs_read;
215 	unsigned max_index = 0;
216 	while (mask) {
217 		int location = u_bit_scan64(&mask);
218 
219 		if (is_tess_levels(location))
220 			continue;
221 
222 		unsigned index = shader_io_get_unique_index(location);
223 		max_index = MAX2(max_index, index + 1);
224 	}
225 
226 	return max_index;
227 }
228 
229 static void
lower_block_to_explicit_output(nir_block * block,nir_builder * b,struct state * state)230 lower_block_to_explicit_output(nir_block *block, nir_builder *b, struct state *state)
231 {
232 	nir_foreach_instr_safe (instr, block) {
233 		if (instr->type != nir_instr_type_intrinsic)
234 			continue;
235 
236 		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
237 
238 		switch (intr->intrinsic) {
239 		case nir_intrinsic_store_output: {
240 			// src[] = { value, offset }.
241 
242 			/* nir_lower_io_to_temporaries replaces all access to output
243 			 * variables with temp variables and then emits a nir_copy_var at
244 			 * the end of the shader.  Thus, we should always get a full wrmask
245 			 * here.
246 			 */
247 			assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
248 
249 			b->cursor = nir_instr_remove(&intr->instr);
250 
251 			nir_ssa_def *vertex_id = build_vertex_id(b, state);
252 			nir_ssa_def *offset = build_local_offset(b, state, vertex_id,
253 					nir_intrinsic_io_semantics(intr).location,
254 					nir_intrinsic_component(intr),
255 					intr->src[1].ssa);
256 			nir_intrinsic_instr *store =
257 				nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_shared_ir3);
258 
259 			store->src[0] = nir_src_for_ssa(intr->src[0].ssa);
260 			store->src[1] = nir_src_for_ssa(offset);
261 			store->num_components = intr->num_components;
262 
263 			nir_builder_instr_insert(b, &store->instr);
264 			break;
265 		}
266 
267 		default:
268 			break;
269 		}
270 	}
271 }
272 
273 static nir_ssa_def *
local_thread_id(nir_builder * b)274 local_thread_id(nir_builder *b)
275 {
276 	return bitfield_extract(b, nir_load_gs_header_ir3(b), 16, 1023);
277 }
278 
279 void
ir3_nir_lower_to_explicit_output(nir_shader * shader,struct ir3_shader_variant * v,unsigned topology)280 ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader_variant *v,
281 		unsigned topology)
282 {
283 	struct state state = { };
284 
285 	build_primitive_map(shader, &state.map);
286 	memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
287 
288 	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
289 	assert(impl);
290 
291 	nir_builder b;
292 	nir_builder_init(&b, impl);
293 	b.cursor = nir_before_cf_list(&impl->body);
294 
295 	if (v->type == MESA_SHADER_VERTEX && topology != IR3_TESS_NONE)
296 		state.header = nir_load_tcs_header_ir3(&b);
297 	else
298 		state.header = nir_load_gs_header_ir3(&b);
299 
300 	nir_foreach_block_safe (block, impl)
301 		lower_block_to_explicit_output(block, &b, &state);
302 
303 	nir_metadata_preserve(impl, nir_metadata_block_index |
304 			nir_metadata_dominance);
305 
306 	v->output_size = state.map.stride;
307 }
308 
309 
310 static void
lower_block_to_explicit_input(nir_block * block,nir_builder * b,struct state * state)311 lower_block_to_explicit_input(nir_block *block, nir_builder *b, struct state *state)
312 {
313 	nir_foreach_instr_safe (instr, block) {
314 		if (instr->type != nir_instr_type_intrinsic)
315 			continue;
316 
317 		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
318 
319 		switch (intr->intrinsic) {
320 		case nir_intrinsic_load_per_vertex_input: {
321 			// src[] = { vertex, offset }.
322 
323 			b->cursor = nir_before_instr(&intr->instr);
324 
325 			nir_ssa_def *offset = build_local_offset(b, state,
326 					intr->src[0].ssa, // this is typically gl_InvocationID
327 					nir_intrinsic_io_semantics(intr).location,
328 					nir_intrinsic_component(intr),
329 					intr->src[1].ssa);
330 
331 			replace_intrinsic(b, intr, nir_intrinsic_load_shared_ir3, offset, NULL, NULL);
332 			break;
333 		}
334 
335 		case nir_intrinsic_load_invocation_id: {
336 			b->cursor = nir_before_instr(&intr->instr);
337 
338 			nir_ssa_def *iid = build_invocation_id(b, state);
339 			nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(iid));
340 			nir_instr_remove(&intr->instr);
341 			break;
342 		}
343 
344 		default:
345 			break;
346 		}
347 	}
348 }
349 
350 void
ir3_nir_lower_to_explicit_input(nir_shader * shader,struct ir3_shader_variant * v)351 ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_shader_variant *v)
352 {
353  	struct state state = { };
354 
355 	/* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
356 	 * HS uses a different primitive id, which starts at bit 16 in the header
357 	 */
358 	if (shader->info.stage == MESA_SHADER_TESS_CTRL && v->shader->compiler->tess_use_shared)
359 		state.local_primitive_id_start = 16;
360 
361 	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
362 	assert(impl);
363 
364 	nir_builder b;
365 	nir_builder_init(&b, impl);
366 	b.cursor = nir_before_cf_list(&impl->body);
367 
368 	if (shader->info.stage == MESA_SHADER_GEOMETRY)
369 		state.header = nir_load_gs_header_ir3(&b);
370 	else
371 		state.header = nir_load_tcs_header_ir3(&b);
372 
373 	nir_foreach_block_safe (block, impl)
374 		lower_block_to_explicit_input(block, &b, &state);
375 
376 	v->input_size = calc_primitive_map_size(shader);
377 }
378 
379 static nir_ssa_def *
build_tcs_out_vertices(nir_builder * b)380 build_tcs_out_vertices(nir_builder *b)
381 {
382 	if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
383 		return nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
384 	else
385 		return nir_load_patch_vertices_in(b);
386 }
387 
388 static nir_ssa_def *
build_per_vertex_offset(nir_builder * b,struct state * state,nir_ssa_def * vertex,uint32_t location,uint32_t comp,nir_ssa_def * offset)389 build_per_vertex_offset(nir_builder *b, struct state *state,
390 		nir_ssa_def *vertex, uint32_t location, uint32_t comp, nir_ssa_def *offset)
391 {
392 	nir_ssa_def *primitive_id = nir_load_primitive_id(b);
393 	nir_ssa_def *patch_stride = nir_load_hs_patch_stride_ir3(b);
394 	nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, patch_stride);
395 	nir_ssa_def *attr_offset;
396 
397 	if (nir_src_is_const(nir_src_for_ssa(offset))) {
398 		location += nir_src_as_uint(nir_src_for_ssa(offset));
399 		offset = nir_imm_int(b, 0);
400 	} else {
401 		/* Offset is in vec4's, but we need it in unit of components for the
402 		 * load/store_global_ir3 offset.
403 		 */
404 		offset = nir_ishl(b, offset, nir_imm_int(b, 2));
405 	}
406 
407 	nir_ssa_def *vertex_offset;
408 	if (vertex) {
409 		unsigned index = shader_io_get_unique_index(location);
410 		switch (b->shader->info.stage) {
411 		case MESA_SHADER_TESS_CTRL:
412 			attr_offset = nir_imm_int(b, state->map.loc[index] + comp);
413 			break;
414 		case MESA_SHADER_TESS_EVAL:
415 			attr_offset =
416 				nir_iadd(b, nir_load_primitive_location_ir3(b, index),
417 						 nir_imm_int(b, comp));
418 			break;
419 		default:
420 			unreachable("bad shader state");
421 		}
422 
423 		attr_offset = nir_iadd(b, attr_offset,
424 							   nir_imul24(b, offset,
425 										  build_tcs_out_vertices(b)));
426 		vertex_offset = nir_ishl(b, vertex, nir_imm_int(b, 2));
427 	} else {
428 		assert(location >= VARYING_SLOT_PATCH0 &&
429 			   location <= VARYING_SLOT_TESS_MAX);
430 		unsigned index = location - VARYING_SLOT_PATCH0;
431 		attr_offset = nir_iadd(b, nir_imm_int(b, index * 4 + comp), offset);
432 		vertex_offset = nir_imm_int(b, 0);
433 	}
434 
435 	return nir_iadd(b, nir_iadd(b, patch_offset, attr_offset), vertex_offset);
436 }
437 
438 static nir_ssa_def *
build_patch_offset(nir_builder * b,struct state * state,uint32_t base,uint32_t comp,nir_ssa_def * offset)439 build_patch_offset(nir_builder *b, struct state *state,
440 		uint32_t base, uint32_t comp, nir_ssa_def *offset)
441 {
442 	return build_per_vertex_offset(b, state, NULL, base, comp, offset);
443 }
444 
445 static void
tess_level_components(struct state * state,uint32_t * inner,uint32_t * outer)446 tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer)
447 {
448 	switch (state->topology) {
449 	case IR3_TESS_TRIANGLES:
450 		*inner = 1;
451 		*outer = 3;
452 		break;
453 	case IR3_TESS_QUADS:
454 		*inner = 2;
455 		*outer = 4;
456 		break;
457 	case IR3_TESS_ISOLINES:
458 		*inner = 0;
459 		*outer = 2;
460 		break;
461 	default:
462 		unreachable("bad");
463 	}
464 }
465 
466 static nir_ssa_def *
build_tessfactor_base(nir_builder * b,gl_varying_slot slot,struct state * state)467 build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state)
468 {
469 	uint32_t inner_levels, outer_levels;
470 	tess_level_components(state, &inner_levels, &outer_levels);
471 
472 	const uint32_t patch_stride = 1 + inner_levels + outer_levels;
473 
474 	nir_ssa_def *primitive_id = nir_load_primitive_id(b);
475 
476 	nir_ssa_def *patch_offset = nir_imul24(b, primitive_id, nir_imm_int(b, patch_stride));
477 
478 	uint32_t offset;
479 	switch (slot) {
480 	case VARYING_SLOT_TESS_LEVEL_OUTER:
481 		/* There's some kind of header dword, tess levels start at index 1. */
482 		offset = 1;
483 		break;
484 	case VARYING_SLOT_TESS_LEVEL_INNER:
485 		offset = 1 + outer_levels;
486 		break;
487 	default:
488 		unreachable("bad");
489 	}
490 
491 	return nir_iadd(b, patch_offset, nir_imm_int(b, offset));
492 }
493 
494 static void
lower_tess_ctrl_block(nir_block * block,nir_builder * b,struct state * state)495 lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
496 {
497 	nir_foreach_instr_safe (instr, block) {
498 		if (instr->type != nir_instr_type_intrinsic)
499 			continue;
500 
501 		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
502 
503 		switch (intr->intrinsic) {
504 		case nir_intrinsic_control_barrier:
505 		case nir_intrinsic_memory_barrier_tcs_patch:
506 			/* Hull shaders dispatch 32 wide so an entire patch will always
507 			 * fit in a single warp and execute in lock-step.  Consequently,
508 			 * we don't need to do anything for TCS barriers so just remove
509 			 * the intrinsic. Otherwise we'll emit an actual barrier
510 			 * instructions, which will deadlock.
511 			 */
512 			nir_instr_remove(&intr->instr);
513 			break;
514 
515 		case nir_intrinsic_load_per_vertex_output: {
516 			// src[] = { vertex, offset }.
517 
518 			b->cursor = nir_before_instr(&intr->instr);
519 
520 			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
521 			nir_ssa_def *offset = build_per_vertex_offset(b, state,
522 					intr->src[0].ssa,
523 					nir_intrinsic_io_semantics(intr).location,
524 					nir_intrinsic_component(intr),
525 				   	intr->src[1].ssa);
526 
527 			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
528 			break;
529 		}
530 
531 		case nir_intrinsic_store_per_vertex_output: {
532 			// src[] = { value, vertex, offset }.
533 
534 			b->cursor = nir_before_instr(&intr->instr);
535 
536 			/* sparse writemask not supported */
537 			assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
538 
539 			nir_ssa_def *value = intr->src[0].ssa;
540 			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
541 			nir_ssa_def *offset = build_per_vertex_offset(b, state,
542 					intr->src[1].ssa,
543 					nir_intrinsic_io_semantics(intr).location,
544 					nir_intrinsic_component(intr),
545 					intr->src[2].ssa);
546 
547 			replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, address, offset);
548 
549 			break;
550 		}
551 
552 		case nir_intrinsic_load_output: {
553 			// src[] = { offset }.
554 
555 			b->cursor = nir_before_instr(&intr->instr);
556 
557 			nir_ssa_def *address, *offset;
558 
559 			/* note if vectorization of the tess level loads ever happens:
560 			 * "ldg" across 16-byte boundaries can behave incorrectly if results
561 			 * are never used. most likely some issue with (sy) not properly
562 			 * syncing with values coming from a second memory transaction.
563 			 */
564 			gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
565 			if (is_tess_levels(location)) {
566 				assert(intr->dest.ssa.num_components == 1);
567 				address = nir_load_tess_factor_base_ir3(b);
568 				offset = build_tessfactor_base(b, location, state);
569 			} else {
570 				address = nir_load_tess_param_base_ir3(b);
571 				offset = build_patch_offset(b, state,
572 											location,
573 											nir_intrinsic_component(intr),
574 											intr->src[0].ssa);
575 			}
576 
577 			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
578 			break;
579 		}
580 
581 		case nir_intrinsic_store_output: {
582 			// src[] = { value, offset }.
583 
584 			/* write patch output to bo */
585 
586 			b->cursor = nir_before_instr(&intr->instr);
587 
588 			/* sparse writemask not supported */
589 			assert(util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1));
590 
591 			gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
592 			if (is_tess_levels(location)) {
593 				/* with tess levels are defined as float[4] and float[2],
594 				 * but tess factor BO has smaller sizes for tris/isolines,
595 				 * so we have to discard any writes beyond the number of
596 				 * components for inner/outer levels */
597 				uint32_t inner_levels, outer_levels, levels;
598 				tess_level_components(state, &inner_levels, &outer_levels);
599 
600 				if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
601 					levels = outer_levels;
602 				else
603 					levels = inner_levels;
604 
605 				assert(intr->src[0].ssa->num_components == 1);
606 
607 				nir_ssa_def *offset =
608 					nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr));
609 
610 				nir_if *nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels)));
611 
612 				replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
613 						intr->src[0].ssa,
614 						nir_load_tess_factor_base_ir3(b),
615 						nir_iadd(b, offset, build_tessfactor_base(b, location, state)));
616 
617 				nir_pop_if(b, nif);
618 			} else {
619 				nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
620 				nir_ssa_def *offset = build_patch_offset(b, state,
621 														 location,
622 														 nir_intrinsic_component(intr),
623 														 intr->src[1].ssa);
624 
625 				debug_assert(nir_intrinsic_component(intr) == 0);
626 
627 				replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
628 						intr->src[0].ssa, address, offset);
629 			}
630 			break;
631 		}
632 
633 		default:
634 			break;
635 		}
636 	}
637 }
638 
639 static void
emit_tess_epilouge(nir_builder * b,struct state * state)640 emit_tess_epilouge(nir_builder *b, struct state *state)
641 {
642 	/* Insert endpatch instruction:
643 	 *
644 	 * TODO we should re-work this to use normal flow control.
645 	 */
646 
647 	nir_intrinsic_instr *end_patch =
648 		nir_intrinsic_instr_create(b->shader, nir_intrinsic_end_patch_ir3);
649 	nir_builder_instr_insert(b, &end_patch->instr);
650 }
651 
652 void
ir3_nir_lower_tess_ctrl(nir_shader * shader,struct ir3_shader_variant * v,unsigned topology)653 ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
654 		unsigned topology)
655 {
656 	struct state state = { .topology = topology };
657 
658 	if (shader_debug_enabled(shader->info.stage)) {
659 		fprintf(stderr, "NIR (before tess lowering) for %s shader:\n",
660 				_mesa_shader_stage_to_string(shader->info.stage));
661 		nir_print_shader(shader, stderr);
662 	}
663 
664 	build_primitive_map(shader, &state.map);
665 	memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));
666 	v->output_size = state.map.stride;
667 
668 	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
669 	assert(impl);
670 
671 	nir_builder b;
672 	nir_builder_init(&b, impl);
673 	b.cursor = nir_before_cf_list(&impl->body);
674 
675 	state.header = nir_load_tcs_header_ir3(&b);
676 
677 	nir_foreach_block_safe (block, impl)
678 		lower_tess_ctrl_block(block, &b, &state);
679 
680 	/* Now move the body of the TCS into a conditional:
681 	 *
682 	 *   if (gl_InvocationID < num_vertices)
683 	 *     // body
684 	 *
685 	 */
686 
687 	nir_cf_list body;
688 	nir_cf_extract(&body, nir_before_cf_list(&impl->body),
689 				   nir_after_cf_list(&impl->body));
690 
691 	b.cursor = nir_after_cf_list(&impl->body);
692 
693 	/* Re-emit the header, since the old one got moved into the if branch */
694 	state.header = nir_load_tcs_header_ir3(&b);
695 	nir_ssa_def *iid = build_invocation_id(&b, &state);
696 
697 	const uint32_t nvertices = shader->info.tess.tcs_vertices_out;
698 	nir_ssa_def *cond = nir_ult(&b, iid, nir_imm_int(&b, nvertices));
699 
700 	nir_if *nif = nir_push_if(&b, cond);
701 
702 	nir_cf_reinsert(&body, b.cursor);
703 
704 	b.cursor = nir_after_cf_list(&nif->then_list);
705 
706 	/* Insert conditional exit for threads invocation id != 0 */
707 	nir_ssa_def *iid0_cond = nir_ieq_imm(&b, iid, 0);
708 	nir_intrinsic_instr *cond_end =
709 		nir_intrinsic_instr_create(shader, nir_intrinsic_cond_end_ir3);
710 	cond_end->src[0] = nir_src_for_ssa(iid0_cond);
711 	nir_builder_instr_insert(&b, &cond_end->instr);
712 
713 	emit_tess_epilouge(&b, &state);
714 
715 	nir_pop_if(&b, nif);
716 
717 	nir_metadata_preserve(impl, 0);
718 }
719 
720 
721 static void
lower_tess_eval_block(nir_block * block,nir_builder * b,struct state * state)722 lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
723 {
724 	nir_foreach_instr_safe (instr, block) {
725 		if (instr->type != nir_instr_type_intrinsic)
726 			continue;
727 
728 		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
729 
730 		switch (intr->intrinsic) {
731 		case nir_intrinsic_load_tess_coord: {
732 			b->cursor = nir_after_instr(&intr->instr);
733 			nir_ssa_def *x = nir_channel(b, &intr->dest.ssa, 0);
734 			nir_ssa_def *y = nir_channel(b, &intr->dest.ssa, 1);
735 			nir_ssa_def *z;
736 
737 			if (state->topology == IR3_TESS_TRIANGLES)
738 				z = nir_fsub(b, nir_fsub(b, nir_imm_float(b, 1.0f), y), x);
739 			else
740 				z = nir_imm_float(b, 0.0f);
741 
742 			nir_ssa_def *coord = nir_vec3(b, x, y, z);
743 
744 			nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
745 					nir_src_for_ssa(coord),
746 					b->cursor.instr);
747 			break;
748 		}
749 
750 		case nir_intrinsic_load_per_vertex_input: {
751 			// src[] = { vertex, offset }.
752 
753 			b->cursor = nir_before_instr(&intr->instr);
754 
755 			nir_ssa_def *address = nir_load_tess_param_base_ir3(b);
756 			nir_ssa_def *offset = build_per_vertex_offset(b, state,
757 					intr->src[0].ssa,
758 					nir_intrinsic_io_semantics(intr).location,
759 					nir_intrinsic_component(intr),
760 				   	intr->src[1].ssa);
761 
762 			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
763 			break;
764 		}
765 
766 		case nir_intrinsic_load_input: {
767 			// src[] = { offset }.
768 
769 			b->cursor = nir_before_instr(&intr->instr);
770 
771 			nir_ssa_def *address, *offset;
772 
773 			/* note if vectorization of the tess level loads ever happens:
774 			 * "ldg" across 16-byte boundaries can behave incorrectly if results
775 			 * are never used. most likely some issue with (sy) not properly
776 			 * syncing with values coming from a second memory transaction.
777 			 */
778 			gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
779 			if (is_tess_levels(location)) {
780 				assert(intr->dest.ssa.num_components == 1);
781 				address = nir_load_tess_factor_base_ir3(b);
782 				offset = build_tessfactor_base(b, location, state);
783 			} else {
784 				address = nir_load_tess_param_base_ir3(b);
785 				offset = build_patch_offset(b, state,
786 											location,
787 											nir_intrinsic_component(intr),
788 											intr->src[0].ssa);
789 			}
790 
791 			offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr)));
792 
793 			replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL);
794 			break;
795 		}
796 
797 		default:
798 			break;
799 		}
800 	}
801 }
802 
803 void
ir3_nir_lower_tess_eval(nir_shader * shader,struct ir3_shader_variant * v,unsigned topology)804 ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology)
805 {
806 	struct state state = { .topology = topology };
807 
808 	if (shader_debug_enabled(shader->info.stage)) {
809 		fprintf(stderr, "NIR (before tess lowering) for %s shader:\n",
810 				_mesa_shader_stage_to_string(shader->info.stage));
811 		nir_print_shader(shader, stderr);
812 	}
813 
814 	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
815 	assert(impl);
816 
817 	nir_builder b;
818 	nir_builder_init(&b, impl);
819 
820 	nir_foreach_block_safe (block, impl)
821 		lower_tess_eval_block(block, &b, &state);
822 
823 	v->input_size = calc_primitive_map_size(shader);
824 
825 	nir_metadata_preserve(impl, 0);
826 }
827 
828 static void
lower_gs_block(nir_block * block,nir_builder * b,struct state * state)829 lower_gs_block(nir_block *block, nir_builder *b, struct state *state)
830 {
831 	nir_foreach_instr_safe (instr, block) {
832 		if (instr->type != nir_instr_type_intrinsic)
833 			continue;
834 
835 		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
836 
837 		switch (intr->intrinsic) {
838 		case nir_intrinsic_end_primitive: {
839 			/* Note: This ignores the stream, which seems to match the blob
840 			 * behavior. I'm guessing the HW ignores any extraneous cut
841 			 * signals from an EndPrimitive() that doesn't correspond to the
842 			 * rasterized stream.
843 			 */
844 			b->cursor = nir_before_instr(&intr->instr);
845 			nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 4), 0x1);
846 			nir_instr_remove(&intr->instr);
847 			break;
848 		}
849 
850 		case nir_intrinsic_emit_vertex: {
851 			/* Load the vertex count */
852 			b->cursor = nir_before_instr(&intr->instr);
853 			nir_ssa_def *count = nir_load_var(b, state->vertex_count_var);
854 
855 			nir_push_if(b, nir_ieq(b, count, local_thread_id(b)));
856 
857 			unsigned stream = nir_intrinsic_stream_id(intr);
858 			/* vertex_flags_out |= stream */
859 			nir_store_var(b, state->vertex_flags_out,
860 						  nir_ior(b, nir_load_var(b, state->vertex_flags_out),
861 								  nir_imm_int(b, stream)), 0x1 /* .x */);
862 
863 			foreach_two_lists(dest_node, &state->emit_outputs, src_node, &state->old_outputs) {
864 				nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
865 				nir_variable *src = exec_node_data(nir_variable, src_node, node);
866 				nir_copy_var(b, dest, src);
867 			}
868 
869 			nir_instr_remove(&intr->instr);
870 
871 			nir_store_var(b, state->emitted_vertex_var,
872 					nir_iadd(b, nir_load_var(b, state->emitted_vertex_var), nir_imm_int(b, 1)), 0x1);
873 
874 			nir_pop_if(b, NULL);
875 
876 			/* Increment the vertex count by 1 */
877 			nir_store_var(b, state->vertex_count_var,
878 					nir_iadd(b, count, nir_imm_int(b, 1)), 0x1); /* .x */
879 			nir_store_var(b, state->vertex_flags_out, nir_imm_int(b, 0), 0x1);
880 
881 			break;
882 		}
883 
884 		default:
885 			break;
886 		}
887 	}
888 }
889 
890 void
ir3_nir_lower_gs(nir_shader * shader)891 ir3_nir_lower_gs(nir_shader *shader)
892 {
893 	struct state state = { };
894 
895 	if (shader_debug_enabled(shader->info.stage)) {
896 		fprintf(stderr, "NIR (before gs lowering):\n");
897 		nir_print_shader(shader, stderr);
898 	}
899 
900 	/* Create an output var for vertex_flags. This will be shadowed below,
901 	 * same way regular outputs get shadowed, and this variable will become a
902 	 * temporary.
903 	 */
904 	state.vertex_flags_out = nir_variable_create(shader, nir_var_shader_out,
905 			glsl_uint_type(), "vertex_flags");
906 	state.vertex_flags_out->data.driver_location = shader->num_outputs++;
907 	state.vertex_flags_out->data.location = VARYING_SLOT_GS_VERTEX_FLAGS_IR3;
908 	state.vertex_flags_out->data.interpolation = INTERP_MODE_NONE;
909 
910 	nir_function_impl *impl = nir_shader_get_entrypoint(shader);
911 	assert(impl);
912 
913 	nir_builder b;
914 	nir_builder_init(&b, impl);
915 	b.cursor = nir_before_cf_list(&impl->body);
916 
917 	state.header = nir_load_gs_header_ir3(&b);
918 
919 	/* Generate two set of shadow vars for the output variables.  The first
920 	 * set replaces the real outputs and the second set (emit_outputs) we'll
921 	 * assign in the emit_vertex conditionals.  Then at the end of the shader
922 	 * we copy the emit_outputs to the real outputs, so that we get
923 	 * store_output in uniform control flow.
924 	 */
925 	exec_list_make_empty(&state.old_outputs);
926 	nir_foreach_shader_out_variable_safe(var, shader) {
927 		exec_node_remove(&var->node);
928 		exec_list_push_tail(&state.old_outputs, &var->node);
929 	}
930 	exec_list_make_empty(&state.new_outputs);
931 	exec_list_make_empty(&state.emit_outputs);
932 	nir_foreach_variable_in_list(var, &state.old_outputs) {
933 		/* Create a new output var by cloning the original output var and
934 		 * stealing the name.
935 		 */
936 		nir_variable *output = nir_variable_clone(var, shader);
937 		exec_list_push_tail(&state.new_outputs, &output->node);
938 
939 		/* Rewrite the original output to be a shadow variable. */
940 		var->name = ralloc_asprintf(var, "%s@gs-temp", output->name);
941 		var->data.mode = nir_var_shader_temp;
942 
943 		/* Clone the shadow variable to create the emit shadow variable that
944 		 * we'll assign in the emit conditionals.
945 		 */
946 		nir_variable *emit_output = nir_variable_clone(var, shader);
947 		emit_output->name = ralloc_asprintf(var, "%s@emit-temp", output->name);
948 		exec_list_push_tail(&state.emit_outputs, &emit_output->node);
949 	}
950 
951 	/* During the shader we'll keep track of which vertex we're currently
952 	 * emitting for the EmitVertex test and how many vertices we emitted so we
953 	 * know to discard if didn't emit any.  In most simple shaders, this can
954 	 * all be statically determined and gets optimized away.
955 	 */
956 	state.vertex_count_var =
957 		nir_local_variable_create(impl, glsl_uint_type(), "vertex_count");
958 	state.emitted_vertex_var =
959 		nir_local_variable_create(impl, glsl_uint_type(), "emitted_vertex");
960 
961 	/* Initialize to 0. */
962 	b.cursor = nir_before_cf_list(&impl->body);
963 	nir_store_var(&b, state.vertex_count_var, nir_imm_int(&b, 0), 0x1);
964 	nir_store_var(&b, state.emitted_vertex_var, nir_imm_int(&b, 0), 0x1);
965 	nir_store_var(&b, state.vertex_flags_out, nir_imm_int(&b, 4), 0x1);
966 
967 	nir_foreach_block_safe (block, impl)
968 		lower_gs_block(block, &b, &state);
969 
970 	set_foreach(impl->end_block->predecessors, block_entry) {
971 		struct nir_block *block = (void *)block_entry->key;
972 		b.cursor = nir_after_block_before_jump(block);
973 
974 		nir_intrinsic_instr *discard_if =
975 			nir_intrinsic_instr_create(b.shader, nir_intrinsic_discard_if);
976 
977 		nir_ssa_def *cond = nir_ieq_imm(&b, nir_load_var(&b, state.emitted_vertex_var), 0);
978 
979 		discard_if->src[0] = nir_src_for_ssa(cond);
980 
981 		nir_builder_instr_insert(&b, &discard_if->instr);
982 
983 		foreach_two_lists(dest_node, &state.new_outputs, src_node, &state.emit_outputs) {
984 			nir_variable *dest = exec_node_data(nir_variable, dest_node, node);
985 			nir_variable *src = exec_node_data(nir_variable, src_node, node);
986 			nir_copy_var(&b, dest, src);
987 		}
988 	}
989 
990 	exec_list_append(&shader->variables, &state.old_outputs);
991 	exec_list_append(&shader->variables, &state.emit_outputs);
992 	exec_list_append(&shader->variables, &state.new_outputs);
993 
994 	nir_metadata_preserve(impl, 0);
995 
996 	nir_lower_global_vars_to_local(shader);
997 	nir_split_var_copies(shader);
998 	nir_lower_var_copies(shader);
999 
1000 	nir_fixup_deref_modes(shader);
1001 
1002 	if (shader_debug_enabled(shader->info.stage)) {
1003 		fprintf(stderr, "NIR (after gs lowering):\n");
1004 		nir_print_shader(shader, stderr);
1005 	}
1006 }
1007 
1008