1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2 
3 /*
4  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  *
25  * Authors:
26  *    Rob Clark <robclark@freedesktop.org>
27  */
28 
29 #include "util/u_math.h"
30 #include "util/register_allocate.h"
31 #include "util/ralloc.h"
32 #include "util/bitset.h"
33 
34 #include "freedreno_util.h"
35 
36 #include "ir3.h"
37 #include "ir3_compiler.h"
38 
39 /*
40  * Register Assignment:
41  *
42  * Uses the register_allocate util, which implements graph coloring
43  * algo with interference classes.  To handle the cases where we need
44  * consecutive registers (for example, texture sample instructions),
45  * we model these as larger (double/quad/etc) registers which conflict
46  * with the corresponding registers in other classes.
47  *
48  * Additionally we create additional classes for half-regs, which
49  * do not conflict with the full-reg classes.  We do need at least
50  * sizes 1-4 (to deal w/ texture sample instructions output to half-
51  * reg).  At the moment we don't create the higher order half-reg
52  * classes as half-reg frequently does not have enough precision
53  * for texture coords at higher resolutions.
54  *
55  * There are some additional cases that we need to handle specially,
56  * as the graph coloring algo doesn't understand "partial writes".
57  * For example, a sequence like:
58  *
59  *   add r0.z, ...
60  *   sam (f32)(xy)r0.x, ...
61  *   ...
62  *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
63  *
64  * In this scenario, we treat r0.xyz as class size 3, which is written
65  * (from a use/def perspective) at the 'add' instruction and ignore the
66  * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
67  * defining instruction, as it is the first to partially write r0.xyz.
68  *
69  * Note i965 has a similar scenario, which they solve with a virtual
70  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
71  * register assignment.  But for us that is horrible from a scheduling
72  * standpoint.  Instead what we do is use idea of 'definer' instruction.
73  * Ie. the first instruction (lowest ip) to write to the variable is the
74  * one we consider from use/def perspective when building interference
75  * graph.  (Other instructions which write other variable components
76  * just define the variable some more.)
77  *
78  * Arrays of arbitrary size are handled via pre-coloring a consecutive
79  * sequence of registers.  Additional scalar (single component) reg
80  * names are allocated starting at ctx->class_base[total_class_count]
81  * (see arr->base), which are pre-colored.  In the use/def graph direct
82  * access is treated as a single element use/def, and indirect access
83  * is treated as use or def of all array elements.  (Only the first
84  * def is tracked, in case of multiple indirect writes, etc.)
85  */
86 
87 static const unsigned class_sizes[] = {
88 	1, 2, 3, 4,
89 	4 + 4, /* txd + 1d/2d */
90 	4 + 6, /* txd + 3d */
91 };
92 #define class_count ARRAY_SIZE(class_sizes)
93 
94 static const unsigned half_class_sizes[] = {
95 	1, 2, 3, 4,
96 };
97 #define half_class_count  ARRAY_SIZE(half_class_sizes)
98 
99 /* seems to just be used for compute shaders?  Seems like vec1 and vec3
100  * are sufficient (for now?)
101  */
102 static const unsigned high_class_sizes[] = {
103 	1, 3,
104 };
105 #define high_class_count ARRAY_SIZE(high_class_sizes)
106 
107 #define total_class_count (class_count + half_class_count + high_class_count)
108 
109 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
110 #define NUM_REGS             (4 * 48)  /* r0 to r47 */
111 #define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
112 #define FIRST_HIGH_REG       (4 * 48)
113 /* Number of virtual regs in a given class: */
114 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
115 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
116 #define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
117 
118 #define HALF_OFFSET          (class_count)
119 #define HIGH_OFFSET          (class_count + half_class_count)
120 
121 /* register-set, created one time, used for all shaders: */
122 struct ir3_ra_reg_set {
123 	struct ra_regs *regs;
124 	unsigned int classes[class_count];
125 	unsigned int half_classes[half_class_count];
126 	unsigned int high_classes[high_class_count];
127 	/* maps flat virtual register space to base gpr: */
128 	uint16_t *ra_reg_to_gpr;
129 	/* maps cls,gpr to flat virtual register space: */
130 	uint16_t **gpr_to_ra_reg;
131 };
132 
133 static void
build_q_values(unsigned int ** q_values,unsigned off,const unsigned * sizes,unsigned count)134 build_q_values(unsigned int **q_values, unsigned off,
135 		const unsigned *sizes, unsigned count)
136 {
137 	for (unsigned i = 0; i < count; i++) {
138 		q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
139 
140 		/* From register_allocate.c:
141 		 *
142 		 * q(B,C) (indexed by C, B is this register class) in
143 		 * Runeson/Nyström paper.  This is "how many registers of B could
144 		 * the worst choice register from C conflict with".
145 		 *
146 		 * If we just let the register allocation algorithm compute these
147 		 * values, is extremely expensive.  However, since all of our
148 		 * registers are laid out, we can very easily compute them
149 		 * ourselves.  View the register from C as fixed starting at GRF n
150 		 * somewhere in the middle, and the register from B as sliding back
151 		 * and forth.  Then the first register to conflict from B is the
152 		 * one starting at n - class_size[B] + 1 and the last register to
153 		 * conflict will start at n + class_size[B] - 1.  Therefore, the
154 		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
155 		 *
156 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
157 		 * B | | | | | |n| --> | | | | | | |
158 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
159 		 *             +-+-+-+-+-+
160 		 * C           |n| | | | |
161 		 *             +-+-+-+-+-+
162 		 *
163 		 * (Idea copied from brw_fs_reg_allocate.cpp)
164 		 */
165 		for (unsigned j = 0; j < count; j++)
166 			q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
167 	}
168 }
169 
170 /* One-time setup of RA register-set, which describes all the possible
171  * "virtual" registers and their interferences.  Ie. double register
172  * occupies (and conflicts with) two single registers, and so forth.
173  * Since registers do not need to be aligned to their class size, they
174  * can conflict with other registers in the same class too.  Ie:
175  *
176  *    Single (base) |  Double
177  *    --------------+---------------
178  *       R0         |  D0
179  *       R1         |  D0 D1
180  *       R2         |     D1 D2
181  *       R3         |        D2
182  *           .. and so on..
183  *
184  * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
185  * really just four scalar registers.  Don't let that confuse you.)
186  */
187 struct ir3_ra_reg_set *
ir3_ra_alloc_reg_set(void * memctx)188 ir3_ra_alloc_reg_set(void *memctx)
189 {
190 	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
191 	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
192 	unsigned int **q_values;
193 
194 	/* calculate # of regs across all classes: */
195 	ra_reg_count = 0;
196 	for (unsigned i = 0; i < class_count; i++)
197 		ra_reg_count += CLASS_REGS(i);
198 	for (unsigned i = 0; i < half_class_count; i++)
199 		ra_reg_count += HALF_CLASS_REGS(i);
200 	for (unsigned i = 0; i < high_class_count; i++)
201 		ra_reg_count += HIGH_CLASS_REGS(i);
202 
203 	/* allocate and populate q_values: */
204 	q_values = ralloc_array(set, unsigned *, total_class_count);
205 
206 	build_q_values(q_values, 0, class_sizes, class_count);
207 	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
208 	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
209 
210 	/* allocate the reg-set.. */
211 	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
212 	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
213 	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
214 
215 	/* .. and classes */
216 	reg = 0;
217 	for (unsigned i = 0; i < class_count; i++) {
218 		set->classes[i] = ra_alloc_reg_class(set->regs);
219 
220 		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
221 
222 		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
223 			ra_class_add_reg(set->regs, set->classes[i], reg);
224 
225 			set->ra_reg_to_gpr[reg] = j;
226 			set->gpr_to_ra_reg[i][j] = reg;
227 
228 			for (unsigned br = j; br < j + class_sizes[i]; br++)
229 				ra_add_transitive_reg_conflict(set->regs, br, reg);
230 
231 			reg++;
232 		}
233 	}
234 
235 	first_half_reg = reg;
236 	base = HALF_OFFSET;
237 
238 	for (unsigned i = 0; i < half_class_count; i++) {
239 		set->half_classes[i] = ra_alloc_reg_class(set->regs);
240 
241 		set->gpr_to_ra_reg[base + i] =
242 				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
243 
244 		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
245 			ra_class_add_reg(set->regs, set->half_classes[i], reg);
246 
247 			set->ra_reg_to_gpr[reg] = j;
248 			set->gpr_to_ra_reg[base + i][j] = reg;
249 
250 			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
251 				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
252 
253 			reg++;
254 		}
255 	}
256 
257 	first_high_reg = reg;
258 	base = HIGH_OFFSET;
259 
260 	for (unsigned i = 0; i < high_class_count; i++) {
261 		set->high_classes[i] = ra_alloc_reg_class(set->regs);
262 
263 		set->gpr_to_ra_reg[base + i] =
264 				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
265 
266 		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
267 			ra_class_add_reg(set->regs, set->high_classes[i], reg);
268 
269 			set->ra_reg_to_gpr[reg] = j;
270 			set->gpr_to_ra_reg[base + i][j] = reg;
271 
272 			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
273 				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
274 
275 			reg++;
276 		}
277 	}
278 
279 
280 	ra_set_finalize(set->regs, q_values);
281 
282 	ralloc_free(q_values);
283 
284 	return set;
285 }
286 
287 /* additional block-data (per-block) */
288 struct ir3_ra_block_data {
289 	BITSET_WORD *def;        /* variables defined before used in block */
290 	BITSET_WORD *use;        /* variables used before defined in block */
291 	BITSET_WORD *livein;     /* which defs reach entry point of block */
292 	BITSET_WORD *liveout;    /* which defs reach exit point of block */
293 };
294 
295 /* additional instruction-data (per-instruction) */
296 struct ir3_ra_instr_data {
297 	/* cached instruction 'definer' info: */
298 	struct ir3_instruction *defn;
299 	int off, sz, cls;
300 };
301 
302 /* register-assign context, per-shader */
303 struct ir3_ra_ctx {
304 	struct ir3 *ir;
305 	enum shader_t type;
306 	bool frag_face;
307 
308 	struct ir3_ra_reg_set *set;
309 	struct ra_graph *g;
310 	unsigned alloc_count;
311 	/* one per class, plus one slot for arrays: */
312 	unsigned class_alloc_count[total_class_count + 1];
313 	unsigned class_base[total_class_count + 1];
314 	unsigned instr_cnt;
315 	unsigned *def, *use;     /* def/use table */
316 	struct ir3_ra_instr_data *instrd;
317 };
318 
319 /* does it conflict? */
320 static inline bool
intersects(unsigned a_start,unsigned a_end,unsigned b_start,unsigned b_end)321 intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
322 {
323 	return !((a_start >= b_end) || (b_start >= a_end));
324 }
325 
326 static bool
is_half(struct ir3_instruction * instr)327 is_half(struct ir3_instruction *instr)
328 {
329 	return !!(instr->regs[0]->flags & IR3_REG_HALF);
330 }
331 
332 static bool
is_high(struct ir3_instruction * instr)333 is_high(struct ir3_instruction *instr)
334 {
335 	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
336 }
337 
338 static int
size_to_class(unsigned sz,bool half,bool high)339 size_to_class(unsigned sz, bool half, bool high)
340 {
341 	if (high) {
342 		for (unsigned i = 0; i < high_class_count; i++)
343 			if (high_class_sizes[i] >= sz)
344 				return i + HIGH_OFFSET;
345 	} else if (half) {
346 		for (unsigned i = 0; i < half_class_count; i++)
347 			if (half_class_sizes[i] >= sz)
348 				return i + HALF_OFFSET;
349 	} else {
350 		for (unsigned i = 0; i < class_count; i++)
351 			if (class_sizes[i] >= sz)
352 				return i;
353 	}
354 	debug_assert(0);
355 	return -1;
356 }
357 
358 static bool
is_temp(struct ir3_register * reg)359 is_temp(struct ir3_register *reg)
360 {
361 	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
362 		return false;
363 	if ((reg->num == regid(REG_A0, 0)) ||
364 			(reg->num == regid(REG_P0, 0)))
365 		return false;
366 	return true;
367 }
368 
369 static bool
writes_gpr(struct ir3_instruction * instr)370 writes_gpr(struct ir3_instruction *instr)
371 {
372 	if (is_store(instr))
373 		return false;
374 	/* is dest a normal temp register: */
375 	return is_temp(instr->regs[0]);
376 }
377 
378 static bool
instr_before(struct ir3_instruction * a,struct ir3_instruction * b)379 instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
380 {
381 	if (a->flags & IR3_INSTR_UNUSED)
382 		return false;
383 	return (a->ip < b->ip);
384 }
385 
386 static struct ir3_instruction *
get_definer(struct ir3_ra_ctx * ctx,struct ir3_instruction * instr,int * sz,int * off)387 get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
388 		int *sz, int *off)
389 {
390 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
391 	struct ir3_instruction *d = NULL;
392 
393 	if (id->defn) {
394 		*sz = id->sz;
395 		*off = id->off;
396 		return id->defn;
397 	}
398 
399 	if (instr->opc == OPC_META_FI) {
400 		/* What about the case where collect is subset of array, we
401 		 * need to find the distance between where actual array starts
402 		 * and fanin..  that probably doesn't happen currently.
403 		 */
404 		struct ir3_register *src;
405 		int dsz, doff;
406 
407 		/* note: don't use foreach_ssa_src as this gets called once
408 		 * while assigning regs (which clears SSA flag)
409 		 */
410 		foreach_src_n(src, n, instr) {
411 			struct ir3_instruction *dd;
412 			if (!src->instr)
413 				continue;
414 
415 			dd = get_definer(ctx, src->instr, &dsz, &doff);
416 
417 			if ((!d) || instr_before(dd, d)) {
418 				d = dd;
419 				*sz = dsz;
420 				*off = doff - n;
421 			}
422 		}
423 
424 	} else if (instr->cp.right || instr->cp.left) {
425 		/* covers also the meta:fo case, which ends up w/ single
426 		 * scalar instructions for each component:
427 		 */
428 		struct ir3_instruction *f = ir3_neighbor_first(instr);
429 
430 		/* by definition, the entire sequence forms one linked list
431 		 * of single scalar register nodes (even if some of them may
432 		 * be fanouts from a texture sample (for example) instr.  We
433 		 * just need to walk the list finding the first element of
434 		 * the group defined (lowest ip)
435 		 */
436 		int cnt = 0;
437 
438 		/* need to skip over unused in the group: */
439 		while (f && (f->flags & IR3_INSTR_UNUSED)) {
440 			f = f->cp.right;
441 			cnt++;
442 		}
443 
444 		while (f) {
445 			if ((!d) || instr_before(f, d))
446 				d = f;
447 			if (f == instr)
448 				*off = cnt;
449 			f = f->cp.right;
450 			cnt++;
451 		}
452 
453 		*sz = cnt;
454 
455 	} else {
456 		/* second case is looking directly at the instruction which
457 		 * produces multiple values (eg, texture sample), rather
458 		 * than the fanout nodes that point back to that instruction.
459 		 * This isn't quite right, because it may be part of a larger
460 		 * group, such as:
461 		 *
462 		 *     sam (f32)(xyzw)r0.x, ...
463 		 *     add r1.x, ...
464 		 *     add r1.y, ...
465 		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
466 		 *
467 		 * need to come up with a better way to handle that case.
468 		 */
469 		if (instr->address) {
470 			*sz = instr->regs[0]->size;
471 		} else {
472 			*sz = util_last_bit(instr->regs[0]->wrmask);
473 		}
474 		*off = 0;
475 		d = instr;
476 	}
477 
478 	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
479 		struct ir3_instruction *phi = d->regs[0]->instr;
480 		struct ir3_instruction *dd;
481 		int dsz, doff;
482 
483 		dd = get_definer(ctx, phi, &dsz, &doff);
484 
485 		*sz = MAX2(*sz, dsz);
486 		*off = doff;
487 
488 		if (instr_before(dd, d)) {
489 			d = dd;
490 		}
491 	}
492 
493 	if (d->opc == OPC_META_PHI) {
494 		/* we have already inserted parallel-copies into
495 		 * the phi, so we don't need to chase definers
496 		 */
497 		struct ir3_register *src;
498 		struct ir3_instruction *dd = d;
499 
500 		/* note: don't use foreach_ssa_src as this gets called once
501 		 * while assigning regs (which clears SSA flag)
502 		 */
503 		foreach_src(src, d) {
504 			if (!src->instr)
505 				continue;
506 			if (instr_before(src->instr, dd))
507 				dd = src->instr;
508 		}
509 
510 		d = dd;
511 	}
512 
513 	if (d->opc == OPC_META_FO) {
514 		struct ir3_instruction *dd;
515 		int dsz, doff;
516 
517 		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
518 
519 		/* by definition, should come before: */
520 		debug_assert(instr_before(dd, d));
521 
522 		*sz = MAX2(*sz, dsz);
523 
524 		debug_assert(instr->opc == OPC_META_FO);
525 		*off = MAX2(*off, instr->fo.off);
526 
527 		d = dd;
528 	}
529 
530 	id->defn = d;
531 	id->sz = *sz;
532 	id->off = *off;
533 
534 	return d;
535 }
536 
537 static void
ra_block_find_definers(struct ir3_ra_ctx * ctx,struct ir3_block * block)538 ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
539 {
540 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
541 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
542 		if (instr->regs_count == 0)
543 			continue;
544 		/* couple special cases: */
545 		if (writes_addr(instr) || writes_pred(instr)) {
546 			id->cls = -1;
547 		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
548 			id->cls = total_class_count;
549 			id->defn = instr;
550 		} else {
551 			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
552 			id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
553 		}
554 	}
555 }
556 
557 /* give each instruction a name (and ip), and count up the # of names
558  * of each class
559  */
560 static void
ra_block_name_instructions(struct ir3_ra_ctx * ctx,struct ir3_block * block)561 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
562 {
563 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
564 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
565 
566 #ifdef DEBUG
567 		instr->name = ~0;
568 #endif
569 
570 		ctx->instr_cnt++;
571 
572 		if (instr->regs_count == 0)
573 			continue;
574 
575 		if (!writes_gpr(instr))
576 			continue;
577 
578 		if (id->defn != instr)
579 			continue;
580 
581 		/* arrays which don't fit in one of the pre-defined class
582 		 * sizes are pre-colored:
583 		 */
584 		if (id->cls >= 0) {
585 			instr->name = ctx->class_alloc_count[id->cls]++;
586 			ctx->alloc_count++;
587 		}
588 	}
589 }
590 
591 static void
ra_init(struct ir3_ra_ctx * ctx)592 ra_init(struct ir3_ra_ctx *ctx)
593 {
594 	unsigned n, base;
595 
596 	ir3_clear_mark(ctx->ir);
597 	n = ir3_count_instructions(ctx->ir);
598 
599 	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
600 
601 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
602 		ra_block_find_definers(ctx, block);
603 	}
604 
605 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
606 		ra_block_name_instructions(ctx, block);
607 	}
608 
609 	/* figure out the base register name for each class.  The
610 	 * actual ra name is class_base[cls] + instr->name;
611 	 */
612 	ctx->class_base[0] = 0;
613 	for (unsigned i = 1; i <= total_class_count; i++) {
614 		ctx->class_base[i] = ctx->class_base[i-1] +
615 				ctx->class_alloc_count[i-1];
616 	}
617 
618 	/* and vreg names for array elements: */
619 	base = ctx->class_base[total_class_count];
620 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
621 		arr->base = base;
622 		ctx->class_alloc_count[total_class_count] += arr->length;
623 		base += arr->length;
624 	}
625 	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
626 
627 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
628 	ralloc_steal(ctx->g, ctx->instrd);
629 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
630 	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
631 }
632 
633 static unsigned
__ra_name(struct ir3_ra_ctx * ctx,int cls,struct ir3_instruction * defn)634 __ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
635 {
636 	unsigned name;
637 	debug_assert(cls >= 0);
638 	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
639 	name = ctx->class_base[cls] + defn->name;
640 	debug_assert(name < ctx->alloc_count);
641 	return name;
642 }
643 
644 static int
ra_name(struct ir3_ra_ctx * ctx,struct ir3_ra_instr_data * id)645 ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
646 {
647 	/* TODO handle name mapping for arrays */
648 	return __ra_name(ctx, id->cls, id->defn);
649 }
650 
651 static void
ra_destroy(struct ir3_ra_ctx * ctx)652 ra_destroy(struct ir3_ra_ctx *ctx)
653 {
654 	ralloc_free(ctx->g);
655 }
656 
657 static void
ra_block_compute_live_ranges(struct ir3_ra_ctx * ctx,struct ir3_block * block)658 ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
659 {
660 	struct ir3_ra_block_data *bd;
661 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
662 
663 #define def(name, instr) \
664 		do { \
665 			/* defined on first write: */ \
666 			if (!ctx->def[name]) \
667 				ctx->def[name] = instr->ip; \
668 			ctx->use[name] = instr->ip; \
669 			BITSET_SET(bd->def, name); \
670 		} while(0);
671 
672 #define use(name, instr) \
673 		do { \
674 			ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
675 			if (!BITSET_TEST(bd->def, name)) \
676 				BITSET_SET(bd->use, name); \
677 		} while(0);
678 
679 	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
680 
681 	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
682 	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
683 	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
684 	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
685 
686 	block->data = bd;
687 
688 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
689 		struct ir3_instruction *src;
690 		struct ir3_register *reg;
691 
692 		if (instr->regs_count == 0)
693 			continue;
694 
695 		/* There are a couple special cases to deal with here:
696 		 *
697 		 * fanout: used to split values from a higher class to a lower
698 		 *     class, for example split the results of a texture fetch
699 		 *     into individual scalar values;  We skip over these from
700 		 *     a 'def' perspective, and for a 'use' we walk the chain
701 		 *     up to the defining instruction.
702 		 *
703 		 * fanin: used to collect values from lower class and assemble
704 		 *     them together into a higher class, for example arguments
705 		 *     to texture sample instructions;  We consider these to be
706 		 *     defined at the earliest fanin source.
707 		 *
708 		 * phi: used to merge values from different flow control paths
709 		 *     to the same reg.  Consider defined at earliest phi src,
710 		 *     and update all the other phi src's (which may come later
711 		 *     in the program) as users to extend the var's live range.
712 		 *
713 		 * Most of this, other than phi, is completely handled in the
714 		 * get_definer() helper.
715 		 *
716 		 * In either case, we trace the instruction back to the original
717 		 * definer and consider that as the def/use ip.
718 		 */
719 
720 		if (writes_gpr(instr)) {
721 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
722 			struct ir3_register *dst = instr->regs[0];
723 
724 			if (dst->flags & IR3_REG_ARRAY) {
725 				struct ir3_array *arr =
726 					ir3_lookup_array(ctx->ir, dst->array.id);
727 				unsigned i;
728 
729 				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
730 
731 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
732 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
733 
734 				/* set the node class now.. in case we don't encounter
735 				 * this array dst again.  From register_alloc algo's
736 				 * perspective, these are all single/scalar regs:
737 				 */
738 				for (i = 0; i < arr->length; i++) {
739 					unsigned name = arr->base + i;
740 					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
741 				}
742 
743 				/* indirect write is treated like a write to all array
744 				 * elements, since we don't know which one is actually
745 				 * written:
746 				 */
747 				if (dst->flags & IR3_REG_RELATIV) {
748 					for (i = 0; i < arr->length; i++) {
749 						unsigned name = arr->base + i;
750 						def(name, instr);
751 					}
752 				} else {
753 					unsigned name = arr->base + dst->array.offset;
754 					def(name, instr);
755 				}
756 
757 			} else if (id->defn == instr) {
758 				unsigned name = ra_name(ctx, id);
759 
760 				/* since we are in SSA at this point: */
761 				debug_assert(!BITSET_TEST(bd->use, name));
762 
763 				def(name, id->defn);
764 
765 				if (is_high(id->defn)) {
766 					ra_set_node_class(ctx->g, name,
767 							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
768 				} else if (is_half(id->defn)) {
769 					ra_set_node_class(ctx->g, name,
770 							ctx->set->half_classes[id->cls - HALF_OFFSET]);
771 				} else {
772 					ra_set_node_class(ctx->g, name,
773 							ctx->set->classes[id->cls]);
774 				}
775 
776 				/* extend the live range for phi srcs, which may come
777 				 * from the bottom of the loop
778 				 */
779 				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
780 					struct ir3_instruction *phi = id->defn->regs[0]->instr;
781 					foreach_ssa_src(src, phi) {
782 						/* if src is after phi, then we need to extend
783 						 * the liverange to the end of src's block:
784 						 */
785 						if (src->ip > phi->ip) {
786 							struct ir3_instruction *last =
787 									list_last_entry(&src->block->instr_list,
788 											struct ir3_instruction, node);
789 							ctx->use[name] = MAX2(ctx->use[name], last->ip);
790 						}
791 					}
792 				}
793 			}
794 		}
795 
796 		foreach_src(reg, instr) {
797 			if (reg->flags & IR3_REG_ARRAY) {
798 				struct ir3_array *arr =
799 					ir3_lookup_array(ctx->ir, reg->array.id);
800 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
801 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
802 				/* indirect read is treated like a read fromall array
803 				 * elements, since we don't know which one is actually
804 				 * read:
805 				 */
806 				if (reg->flags & IR3_REG_RELATIV) {
807 					unsigned i;
808 					for (i = 0; i < arr->length; i++) {
809 						unsigned name = arr->base + i;
810 						use(name, instr);
811 					}
812 				} else {
813 					unsigned name = arr->base + reg->array.offset;
814 					use(name, instr);
815 					debug_assert(reg->array.offset < arr->length);
816 				}
817 			} else if ((src = ssa(reg)) && writes_gpr(src)) {
818 				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
819 				use(name, instr);
820 			}
821 		}
822 	}
823 }
824 
825 static bool
ra_compute_livein_liveout(struct ir3_ra_ctx * ctx)826 ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
827 {
828 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
829 	bool progress = false;
830 
831 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
832 		struct ir3_ra_block_data *bd = block->data;
833 
834 		/* update livein: */
835 		for (unsigned i = 0; i < bitset_words; i++) {
836 			BITSET_WORD new_livein =
837 				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
838 
839 			if (new_livein & ~bd->livein[i]) {
840 				bd->livein[i] |= new_livein;
841 				progress = true;
842 			}
843 		}
844 
845 		/* update liveout: */
846 		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
847 			struct ir3_block *succ = block->successors[j];
848 			struct ir3_ra_block_data *succ_bd;
849 
850 			if (!succ)
851 				continue;
852 
853 			succ_bd = succ->data;
854 
855 			for (unsigned i = 0; i < bitset_words; i++) {
856 				BITSET_WORD new_liveout =
857 					(succ_bd->livein[i] & ~bd->liveout[i]);
858 
859 				if (new_liveout) {
860 					bd->liveout[i] |= new_liveout;
861 					progress = true;
862 				}
863 			}
864 		}
865 	}
866 
867 	return progress;
868 }
869 
870 static void
print_bitset(const char * name,BITSET_WORD * bs,unsigned cnt)871 print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
872 {
873 	bool first = true;
874 	debug_printf("  %s:", name);
875 	for (unsigned i = 0; i < cnt; i++) {
876 		if (BITSET_TEST(bs, i)) {
877 			if (!first)
878 				debug_printf(",");
879 			debug_printf(" %04u", i);
880 			first = false;
881 		}
882 	}
883 	debug_printf("\n");
884 }
885 
886 static void
ra_add_interference(struct ir3_ra_ctx * ctx)887 ra_add_interference(struct ir3_ra_ctx *ctx)
888 {
889 	struct ir3 *ir = ctx->ir;
890 
891 	/* initialize array live ranges: */
892 	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
893 		arr->start_ip = ~0;
894 		arr->end_ip = 0;
895 	}
896 
897 	/* compute live ranges (use/def) on a block level, also updating
898 	 * block's def/use bitmasks (used below to calculate per-block
899 	 * livein/liveout):
900 	 */
901 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
902 		ra_block_compute_live_ranges(ctx, block);
903 	}
904 
905 	/* update per-block livein/liveout: */
906 	while (ra_compute_livein_liveout(ctx)) {}
907 
908 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
909 		debug_printf("AFTER LIVEIN/OUT:\n");
910 		ir3_print(ir);
911 		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
912 			struct ir3_ra_block_data *bd = block->data;
913 			debug_printf("block%u:\n", block_id(block));
914 			print_bitset("def", bd->def, ctx->alloc_count);
915 			print_bitset("use", bd->use, ctx->alloc_count);
916 			print_bitset("l/i", bd->livein, ctx->alloc_count);
917 			print_bitset("l/o", bd->liveout, ctx->alloc_count);
918 		}
919 	}
920 
921 	/* extend start/end ranges based on livein/liveout info from cfg: */
922 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
923 		struct ir3_ra_block_data *bd = block->data;
924 
925 		for (unsigned i = 0; i < ctx->alloc_count; i++) {
926 			if (BITSET_TEST(bd->livein, i)) {
927 				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
928 				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
929 			}
930 
931 			if (BITSET_TEST(bd->liveout, i)) {
932 				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
933 				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
934 			}
935 		}
936 	}
937 
938 	/* need to fix things up to keep outputs live: */
939 	for (unsigned i = 0; i < ir->noutputs; i++) {
940 		struct ir3_instruction *instr = ir->outputs[i];
941 		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
942 		ctx->use[name] = ctx->instr_cnt;
943 	}
944 
945 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
946 		for (unsigned j = 0; j < ctx->alloc_count; j++) {
947 			if (intersects(ctx->def[i], ctx->use[i],
948 					ctx->def[j], ctx->use[j])) {
949 				ra_add_node_interference(ctx->g, i, j);
950 			}
951 		}
952 	}
953 }
954 
955 /* some instructions need fix-up if dst register is half precision: */
fixup_half_instr_dst(struct ir3_instruction * instr)956 static void fixup_half_instr_dst(struct ir3_instruction *instr)
957 {
958 	switch (opc_cat(instr->opc)) {
959 	case 1: /* move instructions */
960 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
961 		break;
962 	case 3:
963 		switch (instr->opc) {
964 		case OPC_MAD_F32:
965 			instr->opc = OPC_MAD_F16;
966 			break;
967 		case OPC_SEL_B32:
968 			instr->opc = OPC_SEL_B16;
969 			break;
970 		case OPC_SEL_S32:
971 			instr->opc = OPC_SEL_S16;
972 			break;
973 		case OPC_SEL_F32:
974 			instr->opc = OPC_SEL_F16;
975 			break;
976 		case OPC_SAD_S32:
977 			instr->opc = OPC_SAD_S16;
978 			break;
979 		/* instructions may already be fixed up: */
980 		case OPC_MAD_F16:
981 		case OPC_SEL_B16:
982 		case OPC_SEL_S16:
983 		case OPC_SEL_F16:
984 		case OPC_SAD_S16:
985 			break;
986 		default:
987 			assert(0);
988 			break;
989 		}
990 		break;
991 	case 5:
992 		instr->cat5.type = half_type(instr->cat5.type);
993 		break;
994 	}
995 }
996 /* some instructions need fix-up if src register is half precision: */
fixup_half_instr_src(struct ir3_instruction * instr)997 static void fixup_half_instr_src(struct ir3_instruction *instr)
998 {
999 	switch (instr->opc) {
1000 	case OPC_MOV:
1001 		instr->cat1.src_type = half_type(instr->cat1.src_type);
1002 		break;
1003 	default:
1004 		break;
1005 	}
1006 }
1007 
1008 /* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
1009  * array access(es) which do not have any previous access to depend
1010  * on from scheduling point of view
1011  */
1012 static void
reg_assign(struct ir3_ra_ctx * ctx,struct ir3_register * reg,struct ir3_instruction * instr)1013 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
1014 		struct ir3_instruction *instr)
1015 {
1016 	struct ir3_ra_instr_data *id;
1017 
1018 	if (reg->flags & IR3_REG_ARRAY) {
1019 		struct ir3_array *arr =
1020 			ir3_lookup_array(ctx->ir, reg->array.id);
1021 		unsigned name = arr->base + reg->array.offset;
1022 		unsigned r = ra_get_node_reg(ctx->g, name);
1023 		unsigned num = ctx->set->ra_reg_to_gpr[r];
1024 
1025 		if (reg->flags & IR3_REG_RELATIV) {
1026 			reg->array.offset = num;
1027 		} else {
1028 			reg->num = num;
1029 		}
1030 
1031 		reg->flags &= ~IR3_REG_ARRAY;
1032 	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
1033 		unsigned name = ra_name(ctx, id);
1034 		unsigned r = ra_get_node_reg(ctx->g, name);
1035 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
1036 
1037 		debug_assert(!(reg->flags & IR3_REG_RELATIV));
1038 
1039 		if (is_high(id->defn))
1040 			num += FIRST_HIGH_REG;
1041 
1042 		reg->num = num;
1043 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
1044 
1045 		if (is_half(id->defn))
1046 			reg->flags |= IR3_REG_HALF;
1047 	}
1048 }
1049 
1050 static void
ra_block_alloc(struct ir3_ra_ctx * ctx,struct ir3_block * block)1051 ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
1052 {
1053 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
1054 		struct ir3_register *reg;
1055 
1056 		if (instr->regs_count == 0)
1057 			continue;
1058 
1059 		if (writes_gpr(instr)) {
1060 			reg_assign(ctx, instr->regs[0], instr);
1061 			if (instr->regs[0]->flags & IR3_REG_HALF)
1062 				fixup_half_instr_dst(instr);
1063 		}
1064 
1065 		foreach_src_n(reg, n, instr) {
1066 			struct ir3_instruction *src = reg->instr;
1067 			/* Note: reg->instr could be null for IR3_REG_ARRAY */
1068 			if (!(src || (reg->flags & IR3_REG_ARRAY)))
1069 				continue;
1070 			reg_assign(ctx, instr->regs[n+1], src);
1071 			if (instr->regs[n+1]->flags & IR3_REG_HALF)
1072 				fixup_half_instr_src(instr);
1073 		}
1074 	}
1075 }
1076 
1077 static int
ra_alloc(struct ir3_ra_ctx * ctx)1078 ra_alloc(struct ir3_ra_ctx *ctx)
1079 {
1080 	unsigned n = 0;
1081 
1082 	/* frag shader inputs get pre-assigned, since we have some
1083 	 * constraints/unknowns about setup for some of these regs:
1084 	 */
1085 	if (ctx->type == SHADER_FRAGMENT) {
1086 		struct ir3 *ir = ctx->ir;
1087 		unsigned i = 0, j;
1088 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
1089 			struct ir3_instruction *instr = ir->inputs[i];
1090 			int cls = size_to_class(1, true, false);
1091 			unsigned name = __ra_name(ctx, cls, instr);
1092 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
1093 
1094 			/* if we have frag_face, it gets hr0.x */
1095 			ra_set_node_reg(ctx->g, name, reg);
1096 			i += 4;
1097 		}
1098 
1099 		j = 0;
1100 		for (; i < ir->ninputs; i++) {
1101 			struct ir3_instruction *instr = ir->inputs[i];
1102 			if (instr) {
1103 				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
1104 
1105 				if (id->defn == instr) {
1106 					unsigned name, reg;
1107 
1108 					name = ra_name(ctx, id);
1109 					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
1110 
1111 					ra_set_node_reg(ctx->g, name, reg);
1112 					j += id->sz;
1113 				}
1114 			}
1115 		}
1116 		n = j;
1117 	}
1118 
1119 	/* pre-assign array elements:
1120 	 */
1121 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
1122 		unsigned base = n;
1123 
1124 		if (arr->end_ip == 0)
1125 			continue;
1126 
1127 		/* figure out what else we conflict with which has already
1128 		 * been assigned:
1129 		 */
1130 retry:
1131 		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
1132 			if (arr2 == arr)
1133 				break;
1134 			if (arr2->end_ip == 0)
1135 				continue;
1136 			/* if it intersects with liverange AND register range.. */
1137 			if (intersects(arr->start_ip, arr->end_ip,
1138 					arr2->start_ip, arr2->end_ip) &&
1139 				intersects(base, base + arr->length,
1140 					arr2->reg, arr2->reg + arr2->length)) {
1141 				base = MAX2(base, arr2->reg + arr2->length);
1142 				goto retry;
1143 			}
1144 		}
1145 
1146 		arr->reg = base;
1147 
1148 		for (unsigned i = 0; i < arr->length; i++) {
1149 			unsigned name, reg;
1150 
1151 			name = arr->base + i;
1152 			reg = ctx->set->gpr_to_ra_reg[0][base++];
1153 
1154 			ra_set_node_reg(ctx->g, name, reg);
1155 		}
1156 	}
1157 
1158 	if (!ra_allocate(ctx->g))
1159 		return -1;
1160 
1161 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
1162 		ra_block_alloc(ctx, block);
1163 	}
1164 
1165 	return 0;
1166 }
1167 
ir3_ra(struct ir3 * ir,enum shader_t type,bool frag_coord,bool frag_face)1168 int ir3_ra(struct ir3 *ir, enum shader_t type,
1169 		bool frag_coord, bool frag_face)
1170 {
1171 	struct ir3_ra_ctx ctx = {
1172 			.ir = ir,
1173 			.type = type,
1174 			.frag_face = frag_face,
1175 			.set = ir->compiler->set,
1176 	};
1177 	int ret;
1178 
1179 	ra_init(&ctx);
1180 	ra_add_interference(&ctx);
1181 	ret = ra_alloc(&ctx);
1182 	ra_destroy(&ctx);
1183 
1184 	return ret;
1185 }
1186