/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */
26 
27 #ifndef SB_PASS_H_
28 #define SB_PASS_H_
29 
30 #include <stack>
31 
32 namespace r600_sb {
33 
34 class pass {
35 protected:
36 	sb_context &ctx;
37 	shader &sh;
38 
39 public:
40 	pass(shader &s);
41 
42 	virtual int run();
43 
~pass()44 	virtual ~pass() {}
45 };
46 
47 class vpass : public pass {
48 
49 public:
50 
vpass(shader & s)51 	vpass(shader &s) : pass(s) {}
52 
53 	virtual int init();
54 	virtual int done();
55 
56 	virtual int run();
57 	virtual void run_on(container_node &n);
58 
59 	virtual bool visit(node &n, bool enter);
60 	virtual bool visit(container_node &n, bool enter);
61 	virtual bool visit(alu_group_node &n, bool enter);
62 	virtual bool visit(cf_node &n, bool enter);
63 	virtual bool visit(alu_node &n, bool enter);
64 	virtual bool visit(alu_packed_node &n, bool enter);
65 	virtual bool visit(fetch_node &n, bool enter);
66 	virtual bool visit(region_node &n, bool enter);
67 	virtual bool visit(repeat_node &n, bool enter);
68 	virtual bool visit(depart_node &n, bool enter);
69 	virtual bool visit(if_node &n, bool enter);
70 	virtual bool visit(bb_node &n, bool enter);
71 
72 };
73 
74 class rev_vpass : public vpass {
75 
76 public:
rev_vpass(shader & s)77 	rev_vpass(shader &s) : vpass(s) {}
78 
79 	virtual void run_on(container_node &n);
80 };
81 
82 
83 // =================== PASSES
84 
85 class bytecode;
86 
87 class bc_dump : public vpass {
88 	using vpass::visit;
89 
90 	uint32_t *bc_data;
91 	unsigned ndw;
92 
93 	unsigned id;
94 
95 	unsigned new_group, group_index;
96 
97 public:
98 
99 	bc_dump(shader &s, bytecode *bc = NULL);
100 
bc_dump(shader & s,uint32_t * bc_ptr,unsigned ndw)101 	bc_dump(shader &s, uint32_t *bc_ptr, unsigned ndw) :
102 		vpass(s), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {}
103 
104 	virtual int init();
105 	virtual int done();
106 
107 	virtual bool visit(cf_node &n, bool enter);
108 	virtual bool visit(alu_node &n, bool enter);
109 	virtual bool visit(fetch_node &n, bool enter);
110 
111 	void dump_dw(unsigned dw_id, unsigned count = 2);
112 
113 	void dump(cf_node& n);
114 	void dump(alu_node& n);
115 	void dump(fetch_node& n);
116 };
117 
118 
119 class dce_cleanup : public vpass {
120 	using vpass::visit;
121 
122 	bool remove_unused;
123 
124 public:
125 
dce_cleanup(shader & s)126 	dce_cleanup(shader &s) : vpass(s),
127 		remove_unused(s.dce_flags & DF_REMOVE_UNUSED), nodes_changed(false) {}
128 
129 	virtual int run();
130 
131 	virtual bool visit(node &n, bool enter);
132 	virtual bool visit(alu_group_node &n, bool enter);
133 	virtual bool visit(cf_node &n, bool enter);
134 	virtual bool visit(alu_node &n, bool enter);
135 	virtual bool visit(alu_packed_node &n, bool enter);
136 	virtual bool visit(fetch_node &n, bool enter);
137 	virtual bool visit(region_node &n, bool enter);
138 	virtual bool visit(container_node &n, bool enter);
139 
140 private:
141 
142 	void cleanup_dst(node &n);
143 	bool cleanup_dst_vec(vvec &vv);
144 
145 	// Did we alter/remove nodes during a single pass?
146 	bool nodes_changed;
147 };
148 
149 
150 class def_use : public pass {
151 
152 public:
153 
def_use(shader & sh)154 	def_use(shader &sh) : pass(sh) {}
155 
156 	virtual int run();
157 	void run_on(node *n, bool defs);
158 
159 private:
160 
161 	void process_uses(node *n);
162 	void process_defs(node *n, vvec &vv, bool arr_def);
163 	void process_phi(container_node *c, bool defs, bool uses);
164 };
165 
166 
167 
168 class dump : public vpass {
169 	using vpass::visit;
170 
171 	int level;
172 
173 public:
174 
dump(shader & s)175 	dump(shader &s) : vpass(s), level(0) {}
176 
177 	virtual bool visit(node &n, bool enter);
178 	virtual bool visit(container_node &n, bool enter);
179 	virtual bool visit(alu_group_node &n, bool enter);
180 	virtual bool visit(cf_node &n, bool enter);
181 	virtual bool visit(alu_node &n, bool enter);
182 	virtual bool visit(alu_packed_node &n, bool enter);
183 	virtual bool visit(fetch_node &n, bool enter);
184 	virtual bool visit(region_node &n, bool enter);
185 	virtual bool visit(repeat_node &n, bool enter);
186 	virtual bool visit(depart_node &n, bool enter);
187 	virtual bool visit(if_node &n, bool enter);
188 	virtual bool visit(bb_node &n, bool enter);
189 
190 
191 	static void dump_op(node &n, const char *name);
192 	static void dump_vec(const vvec & vv);
193 	static void dump_set(shader &sh, val_set & v);
194 
195 	static void dump_rels(vvec & vv);
196 
197 	static void dump_val(value *v);
198 	static void dump_op(node *n);
199 
200 	static void dump_op_list(container_node *c);
201 	static void dump_queue(sched_queue &q);
202 
203 	static void dump_alu(alu_node *n);
204 
205 private:
206 
207 	void indent();
208 
209 	void dump_common(node &n);
210 	void dump_flags(node &n);
211 
212 	void dump_live_values(container_node &n, bool before);
213 };
214 
215 
216 // Global Code Motion
217 
218 class gcm : public pass {
219 
220 	sched_queue bu_ready[SQ_NUM];
221 	sched_queue bu_ready_next[SQ_NUM];
222 	sched_queue bu_ready_early[SQ_NUM];
223 	sched_queue ready;
224 	sched_queue ready_above;
225 
226 	unsigned outstanding_lds_oq;
227 	container_node pending;
228 
229 	struct op_info {
230 		bb_node* top_bb;
231 		bb_node* bottom_bb;
op_infoop_info232 		op_info() : top_bb(), bottom_bb() {}
233 	};
234 
235 	typedef std::map<node*, op_info> op_info_map;
236 
237 	typedef std::map<node*, unsigned> nuc_map;
238 
239 	op_info_map op_map;
240 	nuc_map uses;
241 
242 	typedef std::vector<nuc_map> nuc_stack;
243 
244 	nuc_stack nuc_stk;
245 	unsigned ucs_level;
246 
247 	bb_node * bu_bb;
248 
249 	vvec pending_defs;
250 
251 	node_list pending_nodes;
252 
253 	unsigned cur_sq;
254 
255 	// for register pressure tracking in bottom-up pass
256 	val_set live;
257 	int live_count;
258 
259 	static const int rp_threshold = 100;
260 
261 	bool pending_exec_mask_update;
262 
263 public:
264 
gcm(shader & sh)265 	gcm(shader &sh) : pass(sh),
266 		bu_ready(), bu_ready_next(), bu_ready_early(),
267 		ready(), outstanding_lds_oq(),
268 		op_map(), uses(), nuc_stk(1), ucs_level(),
269 		bu_bb(), pending_defs(), pending_nodes(), cur_sq(),
270 		live(), live_count(), pending_exec_mask_update() {}
271 
272 	virtual int run();
273 
274 private:
275 
276 	void collect_instructions(container_node *c, bool early_pass);
277 
278 	void sched_early(container_node *n);
279 	void td_sched_bb(bb_node *bb);
280 	bool td_is_ready(node *n);
281 	void td_release_uses(vvec &v);
282 	void td_release_val(value *v);
283 	void td_schedule(bb_node *bb, node *n);
284 
285 	void sched_late(container_node *n);
286 	void bu_sched_bb(bb_node *bb);
287 	void bu_release_defs(vvec &v, bool src);
288 	void bu_release_phi_defs(container_node *p, unsigned op);
289 	bool bu_is_ready(node *n);
290 	void bu_release_val(value *v);
291 	void bu_release_op(node * n);
292 	void bu_find_best_bb(node *n, op_info &oi);
293 	void bu_schedule(container_node *bb, node *n);
294 
295 	void push_uc_stack();
296 	void pop_uc_stack();
297 
298 	void init_def_count(nuc_map &m, container_node &s);
299 	void init_use_count(nuc_map &m, container_node &s);
300 	unsigned get_uc_vec(vvec &vv);
301 	unsigned get_dc_vec(vvec &vv, bool src);
302 
303 	void add_ready(node *n);
304 
305 	void dump_uc_stack();
306 
307 	unsigned real_alu_count(sched_queue &q, unsigned max);
308 
309 	// check if we have not less than threshold ready alu instructions
310 	bool check_alu_ready_count(unsigned threshold);
311 };
312 
313 
314 class gvn : public vpass {
315 	using vpass::visit;
316 
317 public:
318 
gvn(shader & sh)319 	gvn(shader &sh) : vpass(sh) {}
320 
321 	virtual bool visit(node &n, bool enter);
322 	virtual bool visit(cf_node &n, bool enter);
323 	virtual bool visit(alu_node &n, bool enter);
324 	virtual bool visit(alu_packed_node &n, bool enter);
325 	virtual bool visit(fetch_node &n, bool enter);
326 	virtual bool visit(region_node &n, bool enter);
327 
328 private:
329 
330 	void process_op(node &n, bool rewrite = true);
331 
332 	// returns true if the value was rewritten
333 	bool process_src(value* &v, bool rewrite);
334 
335 
336 	void process_alu_src_constants(node &n, value* &v);
337 };
338 
339 
340 class if_conversion : public pass {
341 
342 public:
343 
if_conversion(shader & sh)344 	if_conversion(shader &sh) : pass(sh) {}
345 
346 	virtual int run();
347 
348 	bool run_on(region_node *r);
349 
350 	void convert_kill_instructions(region_node *r, value *em, bool branch,
351 	                               container_node *c);
352 
353 	bool check_and_convert(region_node *r);
354 
355 	alu_node* convert_phi(value *select, node *phi);
356 
357 };
358 
359 
360 class liveness : public rev_vpass {
361 	using vpass::visit;
362 
363 	val_set live;
364 	bool live_changed;
365 
366 public:
367 
liveness(shader & s)368 	liveness(shader &s) : rev_vpass(s), live_changed(false) {}
369 
370 	virtual int init();
371 
372 	virtual bool visit(node &n, bool enter);
373 	virtual bool visit(bb_node &n, bool enter);
374 	virtual bool visit(container_node &n, bool enter);
375 	virtual bool visit(alu_group_node &n, bool enter);
376 	virtual bool visit(cf_node &n, bool enter);
377 	virtual bool visit(alu_node &n, bool enter);
378 	virtual bool visit(alu_packed_node &n, bool enter);
379 	virtual bool visit(fetch_node &n, bool enter);
380 	virtual bool visit(region_node &n, bool enter);
381 	virtual bool visit(repeat_node &n, bool enter);
382 	virtual bool visit(depart_node &n, bool enter);
383 	virtual bool visit(if_node &n, bool enter);
384 
385 private:
386 
387 	void update_interferences();
388 	void process_op(node &n);
389 
390 	bool remove_val(value *v);
391 	bool remove_vec(vvec &v);
392 	bool process_outs(node& n);
393 	void process_ins(node& n);
394 
395 	void process_phi_outs(container_node *phi);
396 	void process_phi_branch(container_node *phi, unsigned id);
397 
398 	bool process_maydef(value *v);
399 
400 	bool add_vec(vvec &vv, bool src);
401 
402 	void update_src_vec(vvec &vv, bool src);
403 };
404 
405 
406 struct bool_op_info {
407 	bool invert;
408 	unsigned int_cvt;
409 
410 	alu_node *n;
411 };
412 
413 class peephole : public pass {
414 
415 public:
416 
peephole(shader & sh)417 	peephole(shader &sh) : pass(sh) {}
418 
419 	virtual int run();
420 
421 	void run_on(container_node *c);
422 
423 	void optimize_cc_op(alu_node *a);
424 
425 	void optimize_cc_op2(alu_node *a);
426 	void optimize_CNDcc_op(alu_node *a);
427 
428 	bool get_bool_op_info(value *b, bool_op_info& bop);
429 	bool get_bool_flt_to_int_source(alu_node* &a);
430 	void convert_float_setcc(alu_node *f2i, alu_node *s);
431 };
432 
433 
434 class psi_ops : public rev_vpass {
435 	using rev_vpass::visit;
436 
437 public:
438 
psi_ops(shader & s)439 	psi_ops(shader &s) : rev_vpass(s) {}
440 
441 	virtual bool visit(node &n, bool enter);
442 	virtual bool visit(alu_node &n, bool enter);
443 
444 	bool try_inline(node &n);
445 	bool try_reduce(node &n);
446 	bool eliminate(node &n);
447 
448 	void unpredicate(node *n);
449 };
450 
451 
452 // check correctness of the generated code, e.g.:
453 // - expected source operand value is the last value written to its gpr,
454 // - all arguments of phi node should be allocated to the same gpr,
455 // TODO other tests
456 class ra_checker : public pass {
457 
458 	typedef std::map<sel_chan, value *> reg_value_map;
459 
460 	typedef std::vector<reg_value_map> regmap_stack;
461 
462 	regmap_stack rm_stack;
463 	unsigned rm_stk_level;
464 
465 	value* prev_dst[5];
466 
467 public:
468 
ra_checker(shader & sh)469 	ra_checker(shader &sh) : pass(sh), rm_stk_level(0), prev_dst() {}
470 
471 	virtual int run();
472 
473 	void run_on(container_node *c);
474 
475 	void dump_error(const error_info &e);
476 	void dump_all_errors();
477 
478 private:
479 
rmap()480 	reg_value_map& rmap() { return rm_stack[rm_stk_level]; }
481 
482 	void push_stack();
483 	void pop_stack();
484 
485 	// when going out of the alu clause, values in the clause temporary gprs,
486 	// AR, predicate values, PS/PV are destroyed
487 	void kill_alu_only_regs();
488 	void error(node *n, unsigned id, std::string msg);
489 
490 	void check_phi_src(container_node *p, unsigned id);
491 	void process_phi_dst(container_node *p);
492 	void check_alu_group(alu_group_node *g);
493 	void process_op_dst(node *n);
494 	void check_op_src(node *n);
495 	void check_src_vec(node *n, unsigned id, vvec &vv, bool src);
496 	void check_value_gpr(node *n, unsigned id, value *v);
497 };
498 
499 // =======================================
500 
501 
502 class ra_coalesce : public pass {
503 
504 public:
505 
ra_coalesce(shader & sh)506 	ra_coalesce(shader &sh) : pass(sh) {}
507 
508 	virtual int run();
509 };
510 
511 
512 
513 // =======================================
514 
515 class ra_init : public pass {
516 
517 public:
518 
ra_init(shader & sh)519 	ra_init(shader &sh) : pass(sh), prev_chans() {
520 
521 		// The parameter below affects register channels distribution.
522 		// For cayman (VLIW-4) we're trying to distribute the channels
523 		// uniformly, this means significantly better alu slots utilization
524 		// at the expense of higher gpr usage. Hopefully this will improve
525 		// performance, though it has to be proven with real benchmarks yet.
526 		// For VLIW-5 this method could also slightly improve slots
527 		// utilization, but increased register pressure seems more significant
528 		// and overall performance effect is negative according to some
529 		// benchmarks, so it's not used currently. Basically, VLIW-5 doesn't
530 		// really need it because trans slot (unrestricted by register write
531 		// channel) allows to consume most deviations from uniform channel
532 		// distribution.
533 		// Value 3 means that for new allocation we'll use channel that differs
534 		// from 3 last used channels. 0 for VLIW-5 effectively turns this off.
535 
536 		ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
537 	}
538 
539 	virtual int run();
540 
541 private:
542 
543 	unsigned prev_chans;
544 	unsigned ra_tune;
545 
546 	void add_prev_chan(unsigned chan);
547 	unsigned get_preferable_chan_mask();
548 
549 	void ra_node(container_node *c);
550 	void process_op(node *n);
551 
552 	void color(value *v);
553 
554 	void color_bs_constraint(ra_constraint *c);
555 
556 	void assign_color(value *v, sel_chan c);
557 	void alloc_arrays();
558 };
559 
560 // =======================================
561 
562 class ra_split : public pass {
563 
564 public:
565 
ra_split(shader & sh)566 	ra_split(shader &sh) : pass(sh) {}
567 
568 	virtual int run();
569 
570 	void split(container_node *n);
571 	void split_op(node *n);
572 	void split_alu_packed(alu_packed_node *n);
573 	void split_vector_inst(node *n);
574 
575 	void split_packed_ins(alu_packed_node *n);
576 
577 #if 0
578 	void split_pinned_outs(node *n);
579 #endif
580 
581 	void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz);
582 
583 	void split_phi_src(container_node *loc, container_node *c, unsigned id,
584 	                   bool loop);
585 	void split_phi_dst(node *loc, container_node *c, bool loop);
586 	void init_phi_constraints(container_node *c);
587 };
588 
589 
590 
591 class ssa_prepare : public vpass {
592 	using vpass::visit;
593 
594 	typedef std::vector<val_set> vd_stk;
595 	vd_stk stk;
596 
597 	unsigned level;
598 
599 public:
ssa_prepare(shader & s)600 	ssa_prepare(shader &s) : vpass(s), level(0) {}
601 
602 	virtual bool visit(cf_node &n, bool enter);
603 	virtual bool visit(alu_node &n, bool enter);
604 	virtual bool visit(fetch_node &n, bool enter);
605 	virtual bool visit(region_node &n, bool enter);
606 	virtual bool visit(repeat_node &n, bool enter);
607 	virtual bool visit(depart_node &n, bool enter);
608 
609 private:
610 
push_stk()611 	void push_stk() {
612 		++level;
613 		if (level + 1 > stk.size())
614 			stk.resize(level+1);
615 		else
616 			stk[level].clear();
617 	}
pop_stk()618 	void pop_stk() {
619 		assert(level);
620 		--level;
621 		stk[level].add_set(stk[level + 1]);
622 	}
623 
624 	void add_defs(node &n);
625 
cur_set()626 	val_set & cur_set() { return stk[level]; }
627 
628 	container_node* create_phi_nodes(int count);
629 };
630 
631 class ssa_rename : public vpass {
632 	using vpass::visit;
633 
634 	typedef sb_map<value*, unsigned> def_map;
635 
636 	def_map def_count;
637 	def_map lds_oq_count;
638 	def_map lds_rw_count;
639 	std::stack<def_map> rename_stack;
640 	std::stack<def_map> rename_lds_oq_stack;
641 	std::stack<def_map> rename_lds_rw_stack;
642 
643 	typedef std::map<uint32_t, value*> val_map;
644 	val_map values;
645 
646 public:
647 
ssa_rename(shader & s)648 	ssa_rename(shader &s) : vpass(s) {}
649 
650 	virtual int init();
651 
652 	virtual bool visit(container_node &n, bool enter);
653 	virtual bool visit(node &n, bool enter);
654 	virtual bool visit(alu_group_node &n, bool enter);
655 	virtual bool visit(cf_node &n, bool enter);
656 	virtual bool visit(alu_node &n, bool enter);
657 	virtual bool visit(alu_packed_node &n, bool enter);
658 	virtual bool visit(fetch_node &n, bool enter);
659 	virtual bool visit(region_node &n, bool enter);
660 	virtual bool visit(repeat_node &n, bool enter);
661 	virtual bool visit(depart_node &n, bool enter);
662 	virtual bool visit(if_node &n, bool enter);
663 
664 private:
665 
666 	void push(node *phi);
667 	void pop();
668 
669 	unsigned get_index(def_map& m, value* v);
670 	void set_index(def_map& m, value* v, unsigned index);
671 	unsigned new_index(def_map& m, value* v);
672 
673 	value* rename_use(node *n, value* v);
674 	value* rename_def(node *def, value* v);
675 
676 	void rename_src_vec(node *n, vvec &vv, bool src);
677 	void rename_dst_vec(node *def, vvec &vv, bool set_def);
678 
679 	void rename_src(node *n);
680 	void rename_dst(node *n);
681 
682 	void rename_phi_args(container_node *phi, unsigned op, bool def);
683 
684 	void rename_virt(node *n);
685 	void rename_virt_val(node *n, value *v);
686 };
687 
688 class bc_finalizer : public pass {
689 
690 	cf_node *last_export[EXP_TYPE_COUNT];
691 	cf_node *last_cf;
692 
693 	unsigned ngpr;
694 	unsigned nstack;
695 
696 public:
697 
bc_finalizer(shader & sh)698 	bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(),
699 		nstack() {}
700 
701 	virtual int run();
702 
703 	void finalize_loop(region_node *r);
704 	void finalize_if(region_node *r);
705 
706 	void run_on(container_node *c);
707 
708 	void insert_rv6xx_load_ar_workaround(alu_group_node *b4);
709 	void finalize_alu_group(alu_group_node *g, node *prev_node);
710 	bool finalize_alu_src(alu_group_node *g, alu_node *a, alu_group_node *prev_node);
711 
712 	void emit_set_grad(fetch_node* f);
713 	void finalize_fetch(fetch_node *f);
714 
715 	void finalize_cf(cf_node *c);
716 
717 	sel_chan translate_kcache(cf_node *alu, value *v);
718 
719 	void update_ngpr(unsigned gpr);
720 	void update_nstack(region_node *r, unsigned add = 0);
721 
722 	unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
723 	                         unsigned add = 0);
724 
725 	void cf_peephole();
726 
727 private:
728 	void copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start);
729 	void emit_set_texture_offsets(fetch_node &f);
730 };
731 
732 
733 } // namespace r600_sb
734 
735 #endif /* SB_PASS_H_ */
736