1 /*
2  * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * Authors:
24  *      Vadim Girlin
25  */
26 
27 #define BCP_DEBUG 0
28 
29 #if BCP_DEBUG
30 #define BCP_DUMP(q) do { q } while (0)
31 #else
32 #define BCP_DUMP(q)
33 #endif
34 
35 #include "r600_pipe.h"
36 #include "r600_shader.h"
37 #include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1
38 
39 #include <stack>
40 
41 #include "sb_bc.h"
42 #include "sb_shader.h"
43 #include "sb_pass.h"
44 #include "util/macros.h"
45 
46 namespace r600_sb {
47 
decode()48 int bc_parser::decode() {
49 
50 	dw = bc->bytecode;
51 	bc_ndw = bc->ndw;
52 	max_cf = 0;
53 
54 	dec = new bc_decoder(ctx, dw, bc_ndw);
55 
56 	shader_target t = TARGET_UNKNOWN;
57 
58 	if (pshader) {
59 		switch (bc->type) {
60 		case PIPE_SHADER_FRAGMENT: t = TARGET_PS; break;
61 		case PIPE_SHADER_VERTEX:
62 			t = pshader->vs_as_ls ? TARGET_LS : (pshader->vs_as_es ? TARGET_ES : TARGET_VS);
63 			break;
64 		case PIPE_SHADER_GEOMETRY: t = TARGET_GS; break;
65 		case PIPE_SHADER_COMPUTE: t = TARGET_COMPUTE; break;
66 		case PIPE_SHADER_TESS_CTRL: t = TARGET_HS; break;
67 		case PIPE_SHADER_TESS_EVAL: t = pshader->tes_as_es ? TARGET_ES : TARGET_VS; break;
68 		default: assert(!"unknown shader target"); return -1; break;
69 		}
70 	} else {
71 		if (bc->type == PIPE_SHADER_COMPUTE)
72 			t = TARGET_COMPUTE;
73 		else
74 			t = TARGET_FETCH;
75 	}
76 
77 	sh = new shader(ctx, t, bc->debug_id);
78 	sh->safe_math = sb_context::safe_math || (t == TARGET_COMPUTE);
79 
80 	int r = decode_shader();
81 
82 	delete dec;
83 
84 	sh->ngpr = bc->ngpr;
85 	sh->nstack = bc->nstack;
86 
87 	return r;
88 }
89 
decode_shader()90 int bc_parser::decode_shader() {
91 	int r = 0;
92 	unsigned i = 0;
93 	bool eop = false;
94 
95 	sh->init();
96 
97 	do {
98 		eop = false;
99 		if ((r = decode_cf(i, eop)))
100 			return r;
101 
102 	} while (!eop || (i >> 1) < max_cf);
103 
104 	return 0;
105 }
106 
prepare()107 int bc_parser::prepare() {
108 	int r = 0;
109 	if ((r = parse_decls()))
110 		return r;
111 	if ((r = prepare_ir()))
112 		return r;
113 	return 0;
114 }
115 
parse_decls()116 int bc_parser::parse_decls() {
117 
118 	if (!pshader) {
119 		if (gpr_reladdr)
120 			sh->add_gpr_array(0, bc->ngpr, 0x0F);
121 
122 		// compute shaders have some values preloaded in R0, R1
123 		sh->add_input(0 /* GPR */, true /* preloaded */, 0x0F /* mask */);
124 		sh->add_input(1 /* GPR */, true /* preloaded */, 0x0F /* mask */);
125 		return 0;
126 	}
127 
128 	if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) {
129 
130 		assert(pshader->num_arrays);
131 
132 		if (pshader->num_arrays) {
133 			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
134 				r600_shader_array &a = pshader->arrays[i];
135 				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
136 			}
137 		} else {
138 			sh->add_gpr_array(0, pshader->bc.ngpr, 0x0F);
139 		}
140 	}
141 
142 	// GS inputs can add indirect addressing
143 	if (sh->target == TARGET_GS) {
144 		if (pshader->num_arrays) {
145 			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
146 				r600_shader_array &a = pshader->arrays[i];
147 				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
148 			}
149 		}
150 	}
151 
152 	if (sh->target == TARGET_VS || sh->target == TARGET_ES || sh->target == TARGET_HS || sh->target == TARGET_LS)
153 		sh->add_input(0, 1, 0x0F);
154 	else if (sh->target == TARGET_GS) {
155 		sh->add_input(0, 1, 0x0F);
156 		sh->add_input(1, 1, 0x0F);
157 	} else if (sh->target == TARGET_COMPUTE) {
158 		sh->add_input(0, 1, 0x0F);
159 		sh->add_input(1, 1, 0x0F);
160 	}
161 
162 	bool ps_interp = ctx.hw_class >= HW_CLASS_EVERGREEN
163 			&& sh->target == TARGET_PS;
164 
165 	bool ij_interpolators[6];
166 	memset(ij_interpolators, 0, sizeof(ij_interpolators));
167 
168 	for (unsigned i = 0; i < pshader->ninput; ++i) {
169 		r600_shader_io & in = pshader->input[i];
170 		bool preloaded = sh->target == TARGET_PS && !(ps_interp && in.spi_sid);
171 		sh->add_input(in.gpr, preloaded, /*in.write_mask*/ 0x0F);
172 		if (ps_interp && in.spi_sid) {
173 			int k = eg_get_interpolator_index(in.interpolate, in.interpolate_location);
174 			if (k >= 0)
175 				ij_interpolators[k] |= true;
176 		}
177 	}
178 
179 	if (ps_interp) {
180 		/* add the egcm ij interpolators to live inputs */
181 		unsigned num_ij = 0;
182 		for (unsigned i = 0; i < ARRAY_SIZE(ij_interpolators); i++) {
183 			num_ij += ij_interpolators[i];
184 		}
185 
186 		unsigned mask = (1 << (2 * num_ij)) - 1;
187 		unsigned gpr = 0;
188 
189 		while (mask) {
190 			sh->add_input(gpr, true, mask & 0x0F);
191 			++gpr;
192 			mask >>= 4;
193 		}
194 	}
195 
196 	return 0;
197 }
198 
decode_cf(unsigned & i,bool & eop)199 int bc_parser::decode_cf(unsigned &i, bool &eop) {
200 
201 	int r;
202 
203 	cf_node *cf = sh->create_cf();
204 	sh->root->push_back(cf);
205 
206 	unsigned id = i >> 1;
207 
208 	cf->bc.id = id;
209 
210 	if (cf_map.size() < id + 1)
211 		cf_map.resize(id + 1);
212 
213 	cf_map[id] = cf;
214 
215 	if ((r = dec->decode_cf(i, cf->bc)))
216 		return r;
217 
218 	cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags;
219 
220 	if (flags & CF_ALU) {
221 		if ((r = decode_alu_clause(cf)))
222 			return r;
223 	} else if (flags & CF_FETCH) {
224 		if ((r = decode_fetch_clause(cf)))
225 			return r;
226 	} else if (flags & CF_EXP) {
227 		if (cf->bc.rw_rel)
228 			gpr_reladdr = true;
229 		assert(!cf->bc.rw_rel);
230 	} else if (flags & CF_MEM) {
231 		if (cf->bc.rw_rel)
232 			gpr_reladdr = true;
233 		assert(!cf->bc.rw_rel);
234 	} else if (flags & CF_BRANCH) {
235 		if (cf->bc.addr > max_cf)
236 			max_cf = cf->bc.addr;
237 	}
238 
239 	eop = cf->bc.end_of_program || cf->bc.op == CF_OP_CF_END ||
240 			cf->bc.op == CF_OP_RET;
241 	return 0;
242 }
243 
decode_alu_clause(cf_node * cf)244 int bc_parser::decode_alu_clause(cf_node* cf) {
245 	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1, gcnt;
246 
247 	cf->subtype = NST_ALU_CLAUSE;
248 
249 	cgroup = 0;
250 	memset(slots[0], 0, 5*sizeof(slots[0][0]));
251 
252 	unsigned ng = 0;
253 
254 	do {
255 		decode_alu_group(cf, i, gcnt);
256 		assert(gcnt <= cnt);
257 		cnt -= gcnt;
258 		ng++;
259 	} while (cnt);
260 
261 	return 0;
262 }
263 
decode_alu_group(cf_node * cf,unsigned & i,unsigned & gcnt)264 int bc_parser::decode_alu_group(cf_node* cf, unsigned &i, unsigned &gcnt) {
265 	int r;
266 	alu_node *n;
267 	alu_group_node *g = sh->create_alu_group();
268 
269 	cgroup = !cgroup;
270 	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));
271 	gcnt = 0;
272 
273 	unsigned literal_mask = 0;
274 
275 	do {
276 		n = sh->create_alu();
277 		g->push_back(n);
278 
279 		if ((r = dec->decode_alu(i, n->bc)))
280 			return r;
281 
282 		if (!sh->assign_slot(n, slots[cgroup])) {
283 			assert(!"alu slot assignment failed");
284 			return -1;
285 		}
286 
287 		gcnt++;
288 
289 	} while (gcnt <= 5 && !n->bc.last);
290 
291 	assert(n->bc.last);
292 
293 	for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) {
294 		n = static_cast<alu_node*>(*I);
295 
296 		if (n->bc.dst_rel)
297 			gpr_reladdr = true;
298 
299 		for (int k = 0; k < n->bc.op_ptr->src_count; ++k) {
300 			bc_alu_src &src = n->bc.src[k];
301 			if (src.rel)
302 				gpr_reladdr = true;
303 			if (src.sel == ALU_SRC_LITERAL) {
304 				literal_mask |= (1 << src.chan);
305 				src.value.u = dw[i + src.chan];
306 			}
307 		}
308 	}
309 
310 	unsigned literal_ndw = 0;
311 	while (literal_mask) {
312 		g->literals.push_back(dw[i + literal_ndw]);
313 		literal_ndw += 1;
314 		literal_mask >>= 1;
315 	}
316 
317 	literal_ndw = (literal_ndw + 1) & ~1u;
318 
319 	i += literal_ndw;
320 	gcnt += literal_ndw >> 1;
321 
322 	cf->push_back(g);
323 	return 0;
324 }
325 
prepare_alu_clause(cf_node * cf)326 int bc_parser::prepare_alu_clause(cf_node* cf) {
327 
328 	// loop over alu groups
329 	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {
330 		assert(I->subtype == NST_ALU_GROUP);
331 		alu_group_node *g = static_cast<alu_group_node*>(*I);
332 		prepare_alu_group(cf, g);
333 	}
334 
335 	return 0;
336 }
337 
save_set_cf_index(value * val,unsigned idx)338 void bc_parser::save_set_cf_index(value *val, unsigned idx)
339 {
340 	assert(idx <= 1);
341 	assert(val);
342 	cf_index_value[idx] = val;
343 }
get_cf_index_value(unsigned idx)344 value *bc_parser::get_cf_index_value(unsigned idx)
345 {
346 	assert(idx <= 1);
347 	assert(cf_index_value[idx]);
348 	return cf_index_value[idx];
349 }
save_mova(alu_node * mova)350 void bc_parser::save_mova(alu_node *mova)
351 {
352 	assert(mova);
353 	this->mova = mova;
354 }
get_mova()355 alu_node *bc_parser::get_mova()
356 {
357 	assert(mova);
358 	return mova;
359 }
360 
prepare_alu_group(cf_node * cf,alu_group_node * g)361 int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
362 
363 	alu_node *n;
364 
365 	cgroup = !cgroup;
366 	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));
367 
368 	for (node_iterator I = g->begin(), E = g->end();
369 			I != E; ++I) {
370 		n = static_cast<alu_node*>(*I);
371 		bool ubo_indexing[2] = {};
372 
373 		if (!sh->assign_slot(n, slots[cgroup])) {
374 			assert(!"alu slot assignment failed");
375 			return -1;
376 		}
377 
378 		unsigned src_count = n->bc.op_ptr->src_count;
379 
380 		if (ctx.alu_slots(n->bc.op) & AF_4SLOT)
381 			n->flags |= NF_ALU_4SLOT;
382 
383 		n->src.resize(src_count);
384 
385 		unsigned flags = n->bc.op_ptr->flags;
386 
387 		if (flags & AF_LDS) {
388 			bool need_rw = false, need_oqa = false, need_oqb = false;
389 			int ndst = 0, ncount = 0;
390 
391 			/* all non-read operations have side effects */
392 			if (n->bc.op != LDS_OP2_LDS_READ2_RET &&
393 			    n->bc.op != LDS_OP1_LDS_READ_REL_RET &&
394 			    n->bc.op != LDS_OP1_LDS_READ_RET) {
395 				n->flags |= NF_DONT_KILL;
396 				ndst++;
397 				need_rw = true;
398 			}
399 
400 			if (n->bc.op >= LDS_OP2_LDS_ADD_RET && n->bc.op <= LDS_OP1_LDS_USHORT_READ_RET) {
401 				need_oqa = true;
402 				ndst++;
403 			}
404 
405 			if (n->bc.op == LDS_OP2_LDS_READ2_RET || n->bc.op == LDS_OP1_LDS_READ_REL_RET) {
406 				need_oqb = true;
407 				ndst++;
408 			}
409 
410 			n->dst.resize(ndst);
411 			if (need_oqa)
412 				n->dst[ncount++] = sh->get_special_value(SV_LDS_OQA);
413 			if (need_oqb)
414 				n->dst[ncount++] = sh->get_special_value(SV_LDS_OQB);
415 			if (need_rw)
416 				n->dst[ncount++] = sh->get_special_value(SV_LDS_RW);
417 
418 			n->flags |= NF_DONT_MOVE | NF_DONT_HOIST;
419 
420 		} else if (flags & AF_PRED) {
421 			n->dst.resize(3);
422 			if (n->bc.update_pred)
423 				n->dst[1] = sh->get_special_value(SV_ALU_PRED);
424 			if (n->bc.update_exec_mask)
425 				n->dst[2] = sh->get_special_value(SV_EXEC_MASK);
426 
427 			n->flags |= NF_DONT_HOIST;
428 
429 		} else if (flags & AF_KILL) {
430 
431 			n->dst.resize(2);
432 			n->dst[1] = sh->get_special_value(SV_VALID_MASK);
433 			sh->set_uses_kill();
434 
435 			n->flags |= NF_DONT_HOIST | NF_DONT_MOVE |
436 					NF_DONT_KILL | NF_SCHEDULE_EARLY;
437 
438 		} else {
439 			n->dst.resize(1);
440 		}
441 
442 		if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) {
443 			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
444 			// DCE will kill this op
445 			save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1);
446 		} else if (flags & AF_MOVA) {
447 
448 			n->dst[0] = sh->get_special_value(SV_AR_INDEX);
449 			save_mova(n);
450 
451 			n->flags |= NF_DONT_HOIST;
452 
453 		} else if ((n->bc.op_ptr->src_count == 3 || n->bc.write_mask) && !(flags & AF_LDS)) {
454 			assert(!n->bc.dst_rel || n->bc.index_mode == INDEX_AR_X);
455 
456 			value *v = sh->get_gpr_value(false, n->bc.dst_gpr, n->bc.dst_chan,
457 					n->bc.dst_rel);
458 
459 			n->dst[0] = v;
460 		}
461 
462 		if (n->bc.pred_sel) {
463 			sh->has_alu_predication = true;
464 			n->pred = sh->get_special_value(SV_ALU_PRED);
465 		}
466 
467 		for (unsigned s = 0; s < src_count; ++s) {
468 			bc_alu_src &src = n->bc.src[s];
469 
470 			if (src.sel == ALU_SRC_LITERAL) {
471 				n->src[s] = sh->get_const_value(src.value);
472 			} else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) {
473 				unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ?
474 						SLOT_TRANS : src.chan;
475 
476 				// XXX shouldn't happen but llvm backend uses PS on cayman
477 				if (prev_slot == SLOT_TRANS && ctx.is_cayman())
478 					prev_slot = SLOT_X;
479 
480 				alu_node *prev_alu = slots[pgroup][prev_slot];
481 
482 				assert(prev_alu);
483 
484 				if (!prev_alu->dst[0]) {
485 					value * t = sh->create_temp_value();
486 					prev_alu->dst[0] = t;
487 				}
488 
489 				value *d = prev_alu->dst[0];
490 
491 				if (d->is_rel()) {
492 					d = sh->get_gpr_value(true, prev_alu->bc.dst_gpr,
493 					                      prev_alu->bc.dst_chan,
494 					                      prev_alu->bc.dst_rel);
495 				}
496 
497 				n->src[s] = d;
498 			} else if (ctx.is_kcache_sel(src.sel)) {
499 				unsigned sel = src.sel, kc_addr;
500 				unsigned kc_set = ((sel >> 7) & 2) + ((sel >> 5) & 1);
501 
502 				bc_kcache &kc = cf->bc.kc[kc_set];
503 				kc_addr = (kc.addr << 4) + (sel & 0x1F);
504 				n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode);
505 
506 				if (kc.index_mode != KC_INDEX_NONE) {
507 					assert(kc.index_mode != KC_LOCK_LOOP);
508 					ubo_indexing[kc.index_mode - KC_INDEX_0] = true;
509 				}
510 			} else if (src.sel < MAX_GPR) {
511 				value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel);
512 
513 				n->src[s] = v;
514 
515 			} else if (src.sel >= ALU_SRC_PARAM_OFFSET) {
516 				// using slot for value channel because in fact the slot
517 				// determines the channel that is loaded by INTERP_LOAD_P0
518 				// (and maybe some others).
519 				// otherwise GVN will consider INTERP_LOAD_P0s with the same
520 				// param index as equal instructions and leave only one of them
521 				n->src[s] = sh->get_special_ro_value(sel_chan(src.sel,
522 				                                              n->bc.slot));
523 			} else if (ctx.is_lds_oq(src.sel)) {
524 				switch (src.sel) {
525 				case ALU_SRC_LDS_OQ_A:
526 				case ALU_SRC_LDS_OQ_B:
527 					assert(!"Unsupported LDS queue access in SB");
528 					break;
529 				case ALU_SRC_LDS_OQ_A_POP:
530 					n->src[s] = sh->get_special_value(SV_LDS_OQA);
531 					break;
532 				case ALU_SRC_LDS_OQ_B_POP:
533 					n->src[s] = sh->get_special_value(SV_LDS_OQB);
534 					break;
535 				}
536 				n->flags |= NF_DONT_HOIST | NF_DONT_MOVE;
537 
538 			} else {
539 				switch (src.sel) {
540 				case ALU_SRC_0:
541 					n->src[s] = sh->get_const_value(0);
542 					break;
543 				case ALU_SRC_0_5:
544 					n->src[s] = sh->get_const_value(0.5f);
545 					break;
546 				case ALU_SRC_1:
547 					n->src[s] = sh->get_const_value(1.0f);
548 					break;
549 				case ALU_SRC_1_INT:
550 					n->src[s] = sh->get_const_value(1);
551 					break;
552 				case ALU_SRC_M_1_INT:
553 					n->src[s] = sh->get_const_value(-1);
554 					break;
555 				default:
556 					n->src[s] = sh->get_special_ro_value(src.sel);
557 					break;
558 				}
559 			}
560 		}
561 
562 		// add UBO index values if any as dependencies
563 		if (ubo_indexing[0]) {
564 			n->src.push_back(get_cf_index_value(0));
565 		}
566 		if (ubo_indexing[1]) {
567 			n->src.push_back(get_cf_index_value(1));
568 		}
569 
570 		if ((flags & AF_MOVA) && (n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) &&
571 		    ctx.is_cayman())
572 			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
573 			save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1);
574 	}
575 
576 	// pack multislot instructions into alu_packed_node
577 
578 	alu_packed_node *p = NULL;
579 	for (node_iterator N, I = g->begin(), E = g->end(); I != E; I = N) {
580 		N = I + 1;
581 		alu_node *a = static_cast<alu_node*>(*I);
582 		unsigned sflags = a->bc.slot_flags;
583 
584 		if (sflags == AF_4V || (ctx.is_cayman() && sflags == AF_S)) {
585 			if (!p)
586 				p = sh->create_alu_packed();
587 
588 			a->remove();
589 			p->push_back(a);
590 		}
591 	}
592 
593 	if (p) {
594 		g->push_front(p);
595 
596 		if (p->count() == 3 && ctx.is_cayman()) {
597 			// cayman's scalar instruction that can use 3 or 4 slots
598 
599 			// FIXME for simplicity we'll always add 4th slot,
600 			// but probably we might want to always remove 4th slot and make
601 			// sure that regalloc won't choose 'w' component for dst
602 
603 			alu_node *f = static_cast<alu_node*>(p->first);
604 			alu_node *a = sh->create_alu();
605 			a->src = f->src;
606 			a->dst.resize(f->dst.size());
607 			a->bc = f->bc;
608 			a->bc.slot = SLOT_W;
609 			p->push_back(a);
610 		}
611 	}
612 
613 	return 0;
614 }
615 
decode_fetch_clause(cf_node * cf)616 int bc_parser::decode_fetch_clause(cf_node* cf) {
617 	int r;
618 	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1;
619 
620 	if (cf->bc.op_ptr->flags && FF_GDS)
621 		cf->subtype = NST_GDS_CLAUSE;
622 	else
623 		cf->subtype = NST_TEX_CLAUSE;
624 
625 	while (cnt--) {
626 		fetch_node *n = sh->create_fetch();
627 		cf->push_back(n);
628 		if ((r = dec->decode_fetch(i, n->bc)))
629 			return r;
630 		if (n->bc.src_rel || n->bc.dst_rel)
631 			gpr_reladdr = true;
632 
633 	}
634 	return 0;
635 }
636 
prepare_fetch_clause(cf_node * cf)637 int bc_parser::prepare_fetch_clause(cf_node *cf) {
638 
639 	vvec grad_v, grad_h, texture_offsets;
640 
641 	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {
642 
643 		fetch_node *n = static_cast<fetch_node*>(*I);
644 		assert(n->is_valid());
645 
646 		unsigned flags = n->bc.op_ptr->flags;
647 
648 		unsigned vtx = flags & FF_VTX;
649 		unsigned gds = flags & FF_GDS;
650 		unsigned num_src = gds ? 2 : vtx ? ctx.vtx_src_num : 4;
651 
652 		n->dst.resize(4);
653 
654 		if (gds) {
655 			n->flags |= NF_DONT_HOIST | NF_DONT_MOVE | NF_DONT_KILL;
656 		}
657 		if (flags & (FF_SETGRAD | FF_USEGRAD | FF_GETGRAD)) {
658 			sh->uses_gradients = true;
659 		}
660 
661 		if (flags & (FF_SETGRAD | FF_SET_TEXTURE_OFFSETS)) {
662 
663 			vvec *grad = NULL;
664 
665 			switch (n->bc.op) {
666 				case FETCH_OP_SET_GRADIENTS_V:
667 					grad = &grad_v;
668 					break;
669 				case FETCH_OP_SET_GRADIENTS_H:
670 					grad = &grad_h;
671 					break;
672 				case FETCH_OP_SET_TEXTURE_OFFSETS:
673 					grad = &texture_offsets;
674 					break;
675 				default:
676 					assert(!"unexpected SET_GRAD instruction");
677 					return -1;
678 			}
679 
680 			if (grad->empty())
681 				grad->resize(4);
682 
683 			for(unsigned s = 0; s < 4; ++s) {
684 				unsigned sw = n->bc.src_sel[s];
685 				if (sw <= SEL_W)
686 					(*grad)[s] = sh->get_gpr_value(true, n->bc.src_gpr,
687 					                               sw, false);
688 				else if (sw == SEL_0)
689 					(*grad)[s] = sh->get_const_value(0.0f);
690 				else if (sw == SEL_1)
691 					(*grad)[s] = sh->get_const_value(1.0f);
692 			}
693 		} else {
694 			// Fold source values for instructions with hidden target values in to the instructions
695 			// using them. The set instructions are later re-emitted by bc_finalizer
696 			if (flags & FF_USEGRAD) {
697 				n->src.resize(12);
698 				std::copy(grad_v.begin(), grad_v.end(), n->src.begin() + 4);
699 				std::copy(grad_h.begin(), grad_h.end(), n->src.begin() + 8);
700 			} else if (flags & FF_USE_TEXTURE_OFFSETS) {
701 				n->src.resize(8);
702 				std::copy(texture_offsets.begin(), texture_offsets.end(), n->src.begin() + 4);
703 			} else {
704 				n->src.resize(4);
705 			}
706 
707 			for(int s = 0; s < 4; ++s) {
708 				if (n->bc.dst_sel[s] != SEL_MASK)
709 					n->dst[s] = sh->get_gpr_value(false, n->bc.dst_gpr, s, false);
710 				// NOTE: it doesn't matter here which components of the result we
711 				// are using, but original n->bc.dst_sel should be taken into
712 				// account when building the bytecode
713 			}
714 			for(unsigned s = 0; s < num_src; ++s) {
715 				if (n->bc.src_sel[s] <= SEL_W)
716 					n->src[s] = sh->get_gpr_value(true, n->bc.src_gpr,
717 					                              n->bc.src_sel[s], false);
718 			}
719 
720 			// Scheduler will emit the appropriate instructions to set CF_IDX0/1
721 			if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
722 				n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1));
723 			}
724 			if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
725 				n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1));
726 			}
727 		}
728 	}
729 
730 	return 0;
731 }
732 
prepare_ir()733 int bc_parser::prepare_ir() {
734 
735 	for(id_cf_map::iterator I = cf_map.begin(), E = cf_map.end(); I != E; ++I) {
736 		cf_node *c = *I;
737 
738 		if (!c)
739 			continue;
740 
741 		unsigned flags = c->bc.op_ptr->flags;
742 
743 		if (flags & CF_ALU) {
744 			prepare_alu_clause(c);
745 		} else if (flags & CF_FETCH) {
746 			prepare_fetch_clause(c);
747 		} else if (c->bc.op == CF_OP_CALL_FS) {
748 			sh->init_call_fs(c);
749 			c->flags |= NF_SCHEDULE_EARLY | NF_DONT_MOVE;
750 		} else if (flags & CF_LOOP_START) {
751 			prepare_loop(c);
752 		} else if (c->bc.op == CF_OP_JUMP) {
753 			prepare_if(c);
754 		} else if (c->bc.op == CF_OP_LOOP_END) {
755 			loop_stack.pop();
756 		} else if (c->bc.op == CF_OP_LOOP_CONTINUE) {
757 			assert(!loop_stack.empty());
758 			repeat_node *rep = sh->create_repeat(loop_stack.top());
759 			if (c->parent->first != c)
760 				rep->move(c->parent->first, c);
761 			c->replace_with(rep);
762 			sh->simplify_dep_rep(rep);
763 		} else if (c->bc.op == CF_OP_LOOP_BREAK) {
764 			assert(!loop_stack.empty());
765 			depart_node *dep = sh->create_depart(loop_stack.top());
766 			if (c->parent->first != c)
767 				dep->move(c->parent->first, c);
768 			c->replace_with(dep);
769 			sh->simplify_dep_rep(dep);
770 		} else if (flags & CF_EXP) {
771 
772 			// unroll burst exports
773 
774 			assert(c->bc.op == CF_OP_EXPORT || c->bc.op == CF_OP_EXPORT_DONE);
775 
776 			c->bc.set_op(CF_OP_EXPORT);
777 
778 			unsigned burst_count = c->bc.burst_count;
779 			unsigned eop = c->bc.end_of_program;
780 
781 			c->bc.end_of_program = 0;
782 			c->bc.burst_count = 0;
783 
784 			do {
785 				c->src.resize(4);
786 
787 				for(int s = 0; s < 4; ++s) {
788 					switch (c->bc.sel[s]) {
789 					case SEL_0:
790 						c->src[s] = sh->get_const_value(0.0f);
791 						break;
792 					case SEL_1:
793 						c->src[s] = sh->get_const_value(1.0f);
794 						break;
795 					case SEL_MASK:
796 						break;
797 					default:
798 						if (c->bc.sel[s] <= SEL_W)
799 							c->src[s] = sh->get_gpr_value(true, c->bc.rw_gpr,
800 									c->bc.sel[s], false);
801 						else
802 							assert(!"invalid src_sel for export");
803 					}
804 				}
805 
806 				if (!burst_count--)
807 					break;
808 
809 				cf_node *cf_next = sh->create_cf();
810 				cf_next->bc = c->bc;
811 				++cf_next->bc.rw_gpr;
812 				++cf_next->bc.array_base;
813 
814 				c->insert_after(cf_next);
815 				c = cf_next;
816 
817 			} while (1);
818 
819 			c->bc.end_of_program = eop;
820 		} else if (flags & CF_MEM) {
821 
822 			unsigned burst_count = c->bc.burst_count;
823 			unsigned eop = c->bc.end_of_program;
824 
825 			c->bc.end_of_program = 0;
826 			c->bc.burst_count = 0;
827 
828 			do {
829 
830 				c->src.resize(4);
831 
832 				for(int s = 0; s < 4; ++s) {
833 					if (c->bc.comp_mask & (1 << s))
834 						c->src[s] =
835 								sh->get_gpr_value(true, c->bc.rw_gpr, s, false);
836 				}
837 
838 				if (((flags & CF_RAT) || (!(flags & CF_STRM))) && (c->bc.type & 1)) { // indexed write
839 					c->src.resize(8);
840 					for(int s = 0; s < 3; ++s) {
841 						c->src[4 + s] =
842 							sh->get_gpr_value(true, c->bc.index_gpr, s, false);
843 					}
844 
845 					// FIXME probably we can relax it a bit
846 					c->flags |= NF_DONT_HOIST | NF_DONT_MOVE;
847 				}
848 
849 				if (flags & CF_EMIT) {
850 					// Instruction implicitly depends on prior [EMIT_][CUT]_VERTEX
851 					c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
852 					c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
853 					if (sh->target == TARGET_ES) {
854 						// For ES shaders this is an export
855 						c->flags |= NF_DONT_KILL;
856 					}
857 				}
858 
859 				if (!burst_count--)
860 					break;
861 
862 				cf_node *cf_next = sh->create_cf();
863 				cf_next->bc = c->bc;
864 				++cf_next->bc.rw_gpr;
865 
866 				// FIXME is it correct?
867 				cf_next->bc.array_base += cf_next->bc.elem_size + 1;
868 
869 				c->insert_after(cf_next);
870 				c = cf_next;
871 			} while (1);
872 
873 			c->bc.end_of_program = eop;
874 
875 		} else if (flags & CF_EMIT) {
876 			/* quick peephole */
877 			cf_node *prev = static_cast<cf_node *>(c->prev);
878 			if (c->bc.op == CF_OP_CUT_VERTEX &&
879 				prev && prev->is_valid() &&
880 				prev->bc.op == CF_OP_EMIT_VERTEX &&
881 				c->bc.count == prev->bc.count) {
882 				prev->bc.set_op(CF_OP_EMIT_CUT_VERTEX);
883 				prev->bc.end_of_program = c->bc.end_of_program;
884 				c->remove();
885 			}
886 			else {
887 				c->flags |= NF_DONT_KILL | NF_DONT_HOIST | NF_DONT_MOVE;
888 
889 				c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
890 				c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
891 			}
892 		}
893 	}
894 
895 	assert(loop_stack.empty());
896 	return 0;
897 }
898 
prepare_loop(cf_node * c)899 int bc_parser::prepare_loop(cf_node* c) {
900 	assert(c->bc.addr-1 < cf_map.size());
901 
902 	cf_node *end = cf_map[c->bc.addr - 1];
903 	assert(end->bc.op == CF_OP_LOOP_END);
904 	assert(c->parent == end->parent);
905 
906 	region_node *reg = sh->create_region();
907 	repeat_node *rep = sh->create_repeat(reg);
908 
909 	reg->push_back(rep);
910 	c->insert_before(reg);
911 	rep->move(c, end->next);
912 
913 	reg->src_loop = true;
914 
915 	loop_stack.push(reg);
916 	return 0;
917 }
918 
prepare_if(cf_node * c)919 int bc_parser::prepare_if(cf_node* c) {
920 	assert(c->bc.addr-1 < cf_map.size());
921 	cf_node *c_else = NULL, *end = cf_map[c->bc.addr];
922 
923 	if (!end)
924 		return 0; // not quite sure how this happens, malformed input?
925 
926 	BCP_DUMP(
927 		sblog << "parsing JUMP @" << c->bc.id;
928 		sblog << "\n";
929 	);
930 
931 	if (end->bc.op == CF_OP_ELSE) {
932 		BCP_DUMP(
933 			sblog << "  found ELSE : ";
934 			dump::dump_op(end);
935 			sblog << "\n";
936 		);
937 
938 		c_else = end;
939 		end = cf_map[c_else->bc.addr];
940 	} else {
941 		BCP_DUMP(
942 			sblog << "  no else\n";
943 		);
944 
945 		c_else = end;
946 	}
947 
948 	if (c_else->parent != c->parent)
949 		c_else = NULL;
950 
951 	if (end && end->parent != c->parent)
952 		end = NULL;
953 
954 	region_node *reg = sh->create_region();
955 
956 	depart_node *dep2 = sh->create_depart(reg);
957 	depart_node *dep = sh->create_depart(reg);
958 	if_node *n_if = sh->create_if();
959 
960 	c->insert_before(reg);
961 
962 	if (c_else != end)
963 		dep->move(c_else, end);
964 	dep2->move(c, end);
965 
966 	reg->push_back(dep);
967 	dep->push_front(n_if);
968 	n_if->push_back(dep2);
969 
970 	n_if->cond = sh->get_special_value(SV_EXEC_MASK);
971 
972 	return 0;
973 }
974 
975 
976 } // namespace r600_sb
977