/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"
#include "util/register_allocate.h"
#include "util/ralloc.h"
#include "util/bitset.h"

#include "freedreno_util.h"

#include "ir3.h"
#include "ir3_compiler.h"
/*
 * Register Assignment:
 *
 * Uses the register_allocate util, which implements a graph coloring
 * algo with interference classes.  To handle the cases where we need
 * consecutive registers (for example, texture sample instructions),
 * we model these as larger (double/quad/etc) registers which conflict
 * with the corresponding registers in other classes.
 *
 * We also create additional classes for half-regs, which do not
 * conflict with the full-reg classes.  We do need at least sizes
 * 1-4 (to deal w/ texture sample instructions that output to half-
 * reg).  At the moment we don't create the higher order half-reg
 * classes as half-reg frequently does not have enough precision
 * for texture coords at higher resolutions.
 *
 * There are some additional cases that we need to handle specially,
 * as the graph coloring algo doesn't understand "partial writes".
 * For example, a sequence like:
 *
 *   add r0.z, ...
 *   sam (f32)(xy)r0.x, ...
 *   ...
 *   sam (f32)(xyzw)r0.w, r0.x, ...   ; 3d texture, so r0.xyz are coord
 *
 * In this scenario, we treat r0.xyz as class size 3, which is written
 * (from a use/def perspective) at the 'add' instruction and ignore the
 * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
 * defining instruction, as it is the first to partially write r0.xyz.
 *
 * Note i965 has a similar scenario, which they solve with a virtual
 * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
 * register assignment.  But for us that is horrible from a scheduling
 * standpoint.  Instead what we do is use the idea of a 'definer'
 * instruction.  Ie. the first instruction (lowest ip) to write to the
 * variable is the one we consider from a use/def perspective when
 * building the interference graph.  (Other instructions which write
 * other variable components just define the variable some more.)
 *
 * Arrays of arbitrary size are handled via pre-coloring a consecutive
 * sequence of registers.  Additional scalar (single component) reg
 * names are allocated starting at ctx->class_base[total_class_count]
 * (see arr->base), which are pre-colored.  In the use/def graph direct
 * access is treated as a single element use/def, and indirect access
 * is treated as use or def of all array elements.  (Only the first
 * def is tracked, in case of multiple indirect writes, etc.)
 */
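
/* For example (illustrative numbers, not tied to any particular
 * shader): an array of length 4 gets four consecutive scalar names,
 * arr->base .. arr->base+3.  A direct access to element 2 uses/defs
 * just the single name arr->base+2, while an indirect (a0.x-relative)
 * access uses/defs all four names, since we can't know at compile
 * time which element is actually touched.
 */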

static const unsigned class_sizes[] = {
	1, 2, 3, 4,
	4 + 4, /* txd + 1d/2d */
	4 + 6, /* txd + 3d */
};
#define class_count ARRAY_SIZE(class_sizes)

static const unsigned half_class_sizes[] = {
	1, 2, 3, 4,
};
#define half_class_count ARRAY_SIZE(half_class_sizes)
#define total_class_count (class_count + half_class_count)

/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
#define NUM_REGS (4 * 48)
/* Number of virtual regs in a given class: */
#define CLASS_REGS(i) (NUM_REGS - (class_sizes[i] - 1))
#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
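
/* A worked example, using only the numbers defined above: NUM_REGS is
 * 4 * 48 == 192 scalar gprs (r0.x .. r47.w).  A virtual register of
 * the size-4 class can't start in the last three scalar slots, so
 * CLASS_REGS(3) == 192 - (4 - 1) == 189, while the size-1 classes get
 * all 192 starting positions.
 */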

/* register-set, created one time, used for all shaders: */
struct ir3_ra_reg_set {
	struct ra_regs *regs;
	unsigned int classes[class_count];
	unsigned int half_classes[half_class_count];
	/* maps flat virtual register space to base gpr: */
	uint16_t *ra_reg_to_gpr;
	/* maps cls,gpr to flat virtual register space: */
	uint16_t **gpr_to_ra_reg;
};
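
/* The two tables are inverses of each other: the setup code below
 * maintains ra_reg_to_gpr[gpr_to_ra_reg[cls][n]] == n for every class
 * and every valid starting gpr n.  So, for example, translating an
 * allocated node back to a base gpr is just:
 *
 *    unsigned num = set->ra_reg_to_gpr[ra_get_node_reg(g, name)];
 */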

/* One-time setup of RA register-set, which describes all the possible
 * "virtual" registers and their interferences.  Ie. double register
 * occupies (and conflicts with) two single registers, and so forth.
 * Since registers do not need to be aligned to their class size, they
 * can conflict with other registers in the same class too.  Ie:
 *
 *    Single (base) |  Double
 *    --------------+---------------
 *       R0         |  D0
 *       R1         |  D0 D1
 *       R2         |  D1 D2
 *       R3         |  D2
 *           .. and so on..
 *
 * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
 * really just four scalar registers.  Don't let that confuse you.)
 */
struct ir3_ra_reg_set *
ir3_ra_alloc_reg_set(void *memctx)
{
	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
	unsigned ra_reg_count, reg, first_half_reg;
	unsigned int **q_values;

	/* calculate # of regs across all classes: */
	ra_reg_count = 0;
	for (unsigned i = 0; i < class_count; i++)
		ra_reg_count += CLASS_REGS(i);
	for (unsigned i = 0; i < half_class_count; i++)
		ra_reg_count += HALF_CLASS_REGS(i);

	/* allocate and populate q_values: */
	q_values = ralloc_array(set, unsigned *, total_class_count);
	for (unsigned i = 0; i < class_count; i++) {
		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);

		/* From register_allocate.c:
		 *
		 * q(B,C) (indexed by C, B is this register class) in
		 * Runeson/Nyström paper.  This is "how many registers of B could
		 * the worst choice register from C conflict with".
		 *
		 * If we just let the register allocation algorithm compute these
		 * values, it is extremely expensive.  However, since all of our
		 * registers are laid out, we can very easily compute them
		 * ourselves.  View the register from C as fixed starting at GRF n
		 * somewhere in the middle, and the register from B as sliding back
		 * and forth.  Then the first register to conflict from B is the
		 * one starting at n - class_size[B] + 1 and the last register to
		 * conflict will start at n + class_size[B] - 1.  Therefore, the
		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
		 *
		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
		 * B | | | | | |n| --> | | | | | | |
		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
		 *             +-+-+-+-+-+
		 * C           |n| | | | |
		 *             +-+-+-+-+-+
		 *
		 * (Idea copied from brw_fs_reg_allocate.cpp)
		 */
		for (unsigned j = 0; j < class_count; j++)
			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
	}
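	/* For instance, plugging the class sizes defined above into that
	 * formula: q between the size-2 class and the size-3 class is
	 * 2 + 3 - 1 == 4, and q between two size-1 classes is 1.
	 */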

	for (unsigned i = class_count; i < total_class_count; i++) {
		/* note: zero-initialized, since half classes don't conflict
		 * with (and so have q == 0 against) the full classes:
		 */
		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);

		/* see comment above: */
		for (unsigned j = class_count; j < total_class_count; j++) {
			q_values[i][j] = half_class_sizes[i - class_count] +
					half_class_sizes[j - class_count] - 1;
		}
	}

	/* allocate the reg-set.. */
	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);

	/* .. and classes */
	reg = 0;
	for (unsigned i = 0; i < class_count; i++) {
		set->classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));

		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[i][j] = reg;

			for (unsigned br = j; br < j + class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br, reg);

			reg++;
		}
	}

	first_half_reg = reg;

	for (unsigned i = 0; i < half_class_count; i++) {
		set->half_classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[class_count + i] =
				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));

		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->half_classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[class_count + i][j] = reg;

			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);

			reg++;
		}
	}

	ra_set_finalize(set->regs, q_values);

	ralloc_free(q_values);

	return set;
}
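
/* A sketch of the intended usage (the compiler field name is taken
 * from ir3_ra() below, which reads ir->compiler->set): the reg-set is
 * built once per compiler instance and then shared by every shader
 * compile, eg:
 *
 *    compiler->set = ir3_ra_alloc_reg_set(compiler);
 */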

/* additional block-data (per-block) */
struct ir3_ra_block_data {
	BITSET_WORD *def;        /* variables defined before used in block */
	BITSET_WORD *use;        /* variables used before defined in block */
	BITSET_WORD *livein;     /* which defs reach entry point of block */
	BITSET_WORD *liveout;    /* which defs reach exit point of block */
};

/* additional instruction-data (per-instruction) */
struct ir3_ra_instr_data {
	/* cached instruction 'definer' info: */
	struct ir3_instruction *defn;
	int off, sz, cls;
};

/* register-assign context, per-shader */
struct ir3_ra_ctx {
	struct ir3 *ir;
	enum shader_t type;
	bool frag_face;

	struct ir3_ra_reg_set *set;
	struct ra_graph *g;
	unsigned alloc_count;
	/* one per class, plus one slot for arrays: */
	unsigned class_alloc_count[total_class_count + 1];
	unsigned class_base[total_class_count + 1];
	unsigned instr_cnt;
	unsigned *def, *use;     /* def/use table */
	struct ir3_ra_instr_data *instrd;
};

/* does it conflict? */
static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
	return !((a_start >= b_end) || (b_start >= a_end));
}
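
/* Note that the end of a range is treated as exclusive here, so ranges
 * that merely touch do not conflict.  For example:
 *
 *    intersects(2, 10, 10, 20)  -> false
 *    intersects(2, 11, 10, 20)  -> true
 */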

static bool
is_half(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HALF);
}

static int
size_to_class(unsigned sz, bool half)
{
	if (half) {
		for (unsigned i = 0; i < half_class_count; i++)
			if (half_class_sizes[i] >= sz)
				return i + class_count;
	} else {
		for (unsigned i = 0; i < class_count; i++)
			if (class_sizes[i] >= sz)
				return i;
	}
	debug_assert(0);
	return -1;
}
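
/* To make the rounding-up behavior concrete: with the class_sizes
 * table above, size_to_class(5, false) returns 4 (the 4+4 class, the
 * smallest full class with size >= 5), and size_to_class(3, true)
 * returns class_count + 2 (the size-3 half class).
 */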

static bool
is_temp(struct ir3_register *reg)
{
	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
		return false;
	if ((reg->num == regid(REG_A0, 0)) ||
			(reg->num == regid(REG_P0, 0)))
		return false;
	return true;
}

static bool
writes_gpr(struct ir3_instruction *instr)
{
	if (is_store(instr))
		return false;
	/* is dest a normal temp register: */
	return is_temp(instr->regs[0]);
}

static bool
instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
{
	if (a->flags & IR3_INSTR_UNUSED)
		return false;
	return (a->ip < b->ip);
}

static struct ir3_instruction *
get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
		int *sz, int *off)
{
	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
	struct ir3_instruction *d = NULL;

	if (id->defn) {
		*sz = id->sz;
		*off = id->off;
		return id->defn;
	}

	if (instr->opc == OPC_META_FI) {
		/* What about the case where the collect is a subset of the
		 * array?  We would need to find the distance between where
		 * the actual array starts and the fanin.. that probably
		 * doesn't happen currently.
		 */
		struct ir3_register *src;
		int dsz, doff;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src_n(src, n, instr) {
			struct ir3_instruction *dd;
			if (!src->instr)
				continue;

			dd = get_definer(ctx, src->instr, &dsz, &doff);

			if ((!d) || instr_before(dd, d)) {
				d = dd;
				*sz = dsz;
				*off = doff - n;
			}
		}

	} else if (instr->cp.right || instr->cp.left) {
		/* this also covers the meta:fo case, which ends up w/ single
		 * scalar instructions for each component:
		 */
		struct ir3_instruction *f = ir3_neighbor_first(instr);

		/* by definition, the entire sequence forms one linked list
		 * of single scalar register nodes (even if some of them may
		 * be fanouts from, for example, a texture sample instr).  We
		 * just need to walk the list to find the first element of
		 * the group defined (lowest ip)
		 */
		int cnt = 0;

		/* need to skip over unused in the group: */
		while (f && (f->flags & IR3_INSTR_UNUSED)) {
			f = f->cp.right;
			cnt++;
		}

		while (f) {
			if ((!d) || instr_before(f, d))
				d = f;
			if (f == instr)
				*off = cnt;
			f = f->cp.right;
			cnt++;
		}

		*sz = cnt;

	} else {
		/* second case is looking directly at the instruction which
		 * produces multiple values (eg, texture sample), rather
		 * than the fanout nodes that point back to that instruction.
		 * This isn't quite right, because it may be part of a larger
		 * group, such as:
		 *
		 *   sam (f32)(xyzw)r0.x, ...
		 *   add r1.x, ...
		 *   add r1.y, ...
		 *   sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
		 *
		 * need to come up with a better way to handle that case.
		 */
		if (instr->address) {
			*sz = instr->regs[0]->size;
		} else {
			*sz = util_last_bit(instr->regs[0]->wrmask);
		}
		*off = 0;
		d = instr;
	}

	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
		struct ir3_instruction *phi = d->regs[0]->instr;
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, phi, &dsz, &doff);

		*sz = MAX2(*sz, dsz);
		*off = doff;

		if (instr_before(dd, d)) {
			d = dd;
		}
	}

	if (d->opc == OPC_META_PHI) {
		/* we have already inserted parallel-copies into
		 * the phi, so we don't need to chase definers
		 */
		struct ir3_register *src;
		struct ir3_instruction *dd = d;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src(src, d) {
			if (!src->instr)
				continue;
			if (instr_before(src->instr, dd))
				dd = src->instr;
		}

		d = dd;
	}

	if (d->opc == OPC_META_FO) {
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);

		/* by definition, should come before: */
		debug_assert(instr_before(dd, d));

		*sz = MAX2(*sz, dsz);

		debug_assert(instr->opc == OPC_META_FO);
		*off = MAX2(*off, instr->fo.off);

		d = dd;
	}

	id->defn = d;
	id->sz = *sz;
	id->off = *off;

	return d;
}

static void
ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
		if (instr->regs_count == 0)
			continue;
		/* couple special cases: */
		if (writes_addr(instr) || writes_pred(instr)) {
			id->cls = -1;
		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
			id->cls = total_class_count;
			id->defn = instr;
		} else {
			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
			id->cls = size_to_class(id->sz, is_half(id->defn));
		}
	}
}

/* give each instruction a name (and ip), and count up the # of names
 * of each class
 */
static void
ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

#ifdef DEBUG
		instr->name = ~0;
#endif

		ctx->instr_cnt++;

		if (instr->regs_count == 0)
			continue;

		if (!writes_gpr(instr))
			continue;

		if (id->defn != instr)
			continue;

		/* arrays which don't fit in one of the pre-defined class
		 * sizes are pre-colored:
		 */
		if (id->cls >= 0) {
			instr->name = ctx->class_alloc_count[id->cls]++;
			ctx->alloc_count++;
		}
	}
}

static void
ra_init(struct ir3_ra_ctx *ctx)
{
	unsigned n, base;

	ir3_clear_mark(ctx->ir);
	n = ir3_count_instructions(ctx->ir);

	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_find_definers(ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_name_instructions(ctx, block);
	}

	/* figure out the base register name for each class.  The
	 * actual ra name is class_base[cls] + instr->name;
	 */
	ctx->class_base[0] = 0;
	for (unsigned i = 1; i <= total_class_count; i++) {
		ctx->class_base[i] = ctx->class_base[i-1] +
				ctx->class_alloc_count[i-1];
	}

	/* and vreg names for array elements: */
	base = ctx->class_base[total_class_count];
	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
		arr->base = base;
		ctx->class_alloc_count[total_class_count] += arr->length;
		base += arr->length;
	}
	ctx->alloc_count += ctx->class_alloc_count[total_class_count];

	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
	ralloc_steal(ctx->g, ctx->instrd);
	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
}

static unsigned
__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
{
	unsigned name;
	debug_assert(cls >= 0);
	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
	name = ctx->class_base[cls] + defn->name;
	debug_assert(name < ctx->alloc_count);
	return name;
}
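
/* To illustrate with made-up counts: if a shader names 10 size-1
 * values and 3 size-2 values, ra_init() sets class_base[0] == 0 and
 * class_base[1] == 10, so the size-2 value with per-class name 2 gets
 * the flat ra name 10 + 2 == 12.  Array element names follow after
 * class_base[total_class_count].
 */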

static int
ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
{
	/* TODO handle name mapping for arrays */
	return __ra_name(ctx, id->cls, id->defn);
}

static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
	ralloc_free(ctx->g);
}

static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	struct ir3_ra_block_data *bd;
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);

#define def(name, instr) \
		do { \
			/* defined on first write: */ \
			if (!ctx->def[name]) \
				ctx->def[name] = instr->ip; \
			ctx->use[name] = instr->ip; \
			BITSET_SET(bd->def, name); \
		} while(0)

#define use(name, instr) \
		do { \
			ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
			if (!BITSET_TEST(bd->def, name)) \
				BITSET_SET(bd->use, name); \
		} while(0)

	bd = rzalloc(ctx->g, struct ir3_ra_block_data);

	bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);

	block->data = bd;

	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_instruction *src;
		struct ir3_register *reg;

		if (instr->regs_count == 0)
			continue;

		/* There are a couple special cases to deal with here:
		 *
		 * fanout: used to split values from a higher class to a lower
		 *     class, for example split the results of a texture fetch
		 *     into individual scalar values;  We skip over these from
		 *     a 'def' perspective, and for a 'use' we walk the chain
		 *     up to the defining instruction.
		 *
		 * fanin: used to collect values from lower class and assemble
		 *     them together into a higher class, for example arguments
		 *     to texture sample instructions;  We consider these to be
		 *     defined at the earliest fanin source.
		 *
		 * phi: used to merge values from different flow control paths
		 *     to the same reg.  Consider defined at earliest phi src,
		 *     and update all the other phi src's (which may come later
		 *     in the program) as users to extend the var's live range.
		 *
		 * Most of this, other than phi, is completely handled in the
		 * get_definer() helper.
		 *
		 * In each case, we trace the instruction back to the original
		 * definer and consider that as the def/use ip.
		 */

		if (writes_gpr(instr)) {
			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
			struct ir3_register *dst = instr->regs[0];

			if (dst->flags & IR3_REG_ARRAY) {
				struct ir3_array *arr =
					ir3_lookup_array(ctx->ir, dst->array.id);
				unsigned i;

				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));

				arr->start_ip = MIN2(arr->start_ip, instr->ip);
				arr->end_ip = MAX2(arr->end_ip, instr->ip);

				/* set the node class now.. in case we don't encounter
				 * this array dst again.  From register_allocate algo's
				 * perspective, these are all single/scalar regs:
				 */
				for (i = 0; i < arr->length; i++) {
					unsigned name = arr->base + i;
					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
				}

				/* indirect write is treated like a write to all array
				 * elements, since we don't know which one is actually
				 * written:
				 */
				if (dst->flags & IR3_REG_RELATIV) {
					for (i = 0; i < arr->length; i++) {
						unsigned name = arr->base + i;
						def(name, instr);
					}
				} else {
					unsigned name = arr->base + dst->array.offset;
					def(name, instr);
				}

			} else if (id->defn == instr) {
				unsigned name = ra_name(ctx, id);

				/* since we are in SSA at this point: */
				debug_assert(!BITSET_TEST(bd->use, name));

				def(name, id->defn);

				if (is_half(id->defn)) {
					ra_set_node_class(ctx->g, name,
							ctx->set->half_classes[id->cls - class_count]);
				} else {
					ra_set_node_class(ctx->g, name,
							ctx->set->classes[id->cls]);
				}

				/* extend the live range for phi srcs, which may come
				 * from the bottom of the loop
				 */
				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
					struct ir3_instruction *phi = id->defn->regs[0]->instr;
					foreach_ssa_src(src, phi) {
						/* if src is after phi, then we need to extend
						 * the liverange to the end of src's block:
						 */
						if (src->ip > phi->ip) {
							struct ir3_instruction *last =
								list_last_entry(&src->block->instr_list,
									struct ir3_instruction, node);
							ctx->use[name] = MAX2(ctx->use[name], last->ip);
						}
					}
				}
			}
		}

		foreach_src(reg, instr) {
			if (reg->flags & IR3_REG_ARRAY) {
				struct ir3_array *arr =
					ir3_lookup_array(ctx->ir, reg->array.id);
				arr->start_ip = MIN2(arr->start_ip, instr->ip);
				arr->end_ip = MAX2(arr->end_ip, instr->ip);
				/* indirect read is treated like a read from all array
				 * elements, since we don't know which one is actually
				 * read:
				 */
				if (reg->flags & IR3_REG_RELATIV) {
					unsigned i;
					for (i = 0; i < arr->length; i++) {
						unsigned name = arr->base + i;
						use(name, instr);
					}
				} else {
					unsigned name = arr->base + reg->array.offset;
					use(name, instr);
					debug_assert(reg->array.offset < arr->length);
				}
			} else if ((src = ssa(reg)) && writes_gpr(src)) {
				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
				use(name, instr);
			}
		}
	}
}

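/* This computes the standard backwards live-variable dataflow fixed
 * point over the block-local def/use sets built above; in the usual
 * notation, for each block b:
 *
 *    livein(b)  = use(b) | (liveout(b) & ~def(b))
 *    liveout(b) = union of livein(s) for each successor s of b
 *
 * The caller just loops until no block's livein/liveout changes.
 */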
static bool
ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
{
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
	bool progress = false;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		/* update livein: */
		for (unsigned i = 0; i < bitset_words; i++) {
			BITSET_WORD new_livein =
				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));

			if (new_livein & ~bd->livein[i]) {
				bd->livein[i] |= new_livein;
				progress = true;
			}
		}

		/* update liveout: */
		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
			struct ir3_block *succ = block->successors[j];
			struct ir3_ra_block_data *succ_bd;

			if (!succ)
				continue;

			succ_bd = succ->data;

			for (unsigned i = 0; i < bitset_words; i++) {
				BITSET_WORD new_liveout =
					(succ_bd->livein[i] & ~bd->liveout[i]);

				if (new_liveout) {
					bd->liveout[i] |= new_liveout;
					progress = true;
				}
			}
		}
	}

	return progress;
}

static void
print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
{
	bool first = true;
	debug_printf(" %s:", name);
	for (unsigned i = 0; i < cnt; i++) {
		if (BITSET_TEST(bs, i)) {
			if (!first)
				debug_printf(",");
			debug_printf(" %04u", i);
			first = false;
		}
	}
	debug_printf("\n");
}

static void
ra_add_interference(struct ir3_ra_ctx *ctx)
{
	struct ir3 *ir = ctx->ir;

	/* initialize array live ranges: */
	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
		arr->start_ip = ~0;
		arr->end_ip = 0;
	}

	/* compute live ranges (use/def) on a block level, also updating
	 * block's def/use bitmasks (used below to calculate per-block
	 * livein/liveout):
	 */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ra_block_compute_live_ranges(ctx, block);
	}

	/* update per-block livein/liveout: */
	while (ra_compute_livein_liveout(ctx)) {}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		debug_printf("AFTER LIVEIN/OUT:\n");
		ir3_print(ir);
		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
			struct ir3_ra_block_data *bd = block->data;
			debug_printf("block%u:\n", block_id(block));
			print_bitset("def", bd->def, ctx->alloc_count);
			print_bitset("use", bd->use, ctx->alloc_count);
			print_bitset("l/i", bd->livein, ctx->alloc_count);
			print_bitset("l/o", bd->liveout, ctx->alloc_count);
		}
	}

	/* extend start/end ranges based on livein/liveout info from cfg: */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		for (unsigned i = 0; i < ctx->alloc_count; i++) {
			if (BITSET_TEST(bd->livein, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
			}

			if (BITSET_TEST(bd->liveout, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
			}
		}
	}

	/* need to fix things up to keep outputs live: */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *instr = ir->outputs[i];
		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
		ctx->use[name] = ctx->instr_cnt;
	}

	for (unsigned i = 0; i < ctx->alloc_count; i++) {
		for (unsigned j = 0; j < ctx->alloc_count; j++) {
			if (intersects(ctx->def[i], ctx->use[i],
					ctx->def[j], ctx->use[j])) {
				ra_add_node_interference(ctx->g, i, j);
			}
		}
	}
}

/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
	switch (opc_cat(instr->opc)) {
	case 1: /* move instructions */
		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
		break;
	case 3:
		switch (instr->opc) {
		case OPC_MAD_F32:
			instr->opc = OPC_MAD_F16;
			break;
		case OPC_SEL_B32:
			instr->opc = OPC_SEL_B16;
			break;
		case OPC_SEL_S32:
			instr->opc = OPC_SEL_S16;
			break;
		case OPC_SEL_F32:
			instr->opc = OPC_SEL_F16;
			break;
		case OPC_SAD_S32:
			instr->opc = OPC_SAD_S16;
			break;
		/* instructions may already be fixed up: */
		case OPC_MAD_F16:
		case OPC_SEL_B16:
		case OPC_SEL_S16:
		case OPC_SEL_F16:
		case OPC_SAD_S16:
			break;
		default:
			assert(0);
			break;
		}
		break;
	case 5:
		instr->cat5.type = half_type(instr->cat5.type);
		break;
	}
}
/* some instructions need fix-up if src register is half precision: */
static void fixup_half_instr_src(struct ir3_instruction *instr)
{
	switch (instr->opc) {
	case OPC_MOV:
		instr->cat1.src_type = half_type(instr->cat1.src_type);
		break;
	default:
		break;
	}
}
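
/* Eg. if RA assigns an hr* dst to a cat3 mad, OPC_MAD_F32 above is
 * rewritten to OPC_MAD_F16, while for cat1 and cat5 only the type
 * field is narrowed (f32 -> f16), since those opcodes don't encode
 * precision themselves.
 */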

/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
 * array access(es) which do not have any previous access to depend
 * on from a scheduling point of view
 */
static void
reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
		struct ir3_instruction *instr)
{
	struct ir3_ra_instr_data *id;

	if (reg->flags & IR3_REG_ARRAY) {
		struct ir3_array *arr =
			ir3_lookup_array(ctx->ir, reg->array.id);
		unsigned name = arr->base + reg->array.offset;
		unsigned r = ra_get_node_reg(ctx->g, name);
		unsigned num = ctx->set->ra_reg_to_gpr[r];

		if (reg->flags & IR3_REG_RELATIV) {
			reg->array.offset = num;
		} else {
			reg->num = num;
		}

		reg->flags &= ~IR3_REG_ARRAY;
	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
		unsigned name = ra_name(ctx, id);
		unsigned r = ra_get_node_reg(ctx->g, name);
		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;

		debug_assert(!(reg->flags & IR3_REG_RELATIV));

		reg->num = num;
		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);

		if (is_half(id->defn))
			reg->flags |= IR3_REG_HALF;
	}
}

static void
ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_register *reg;

		if (instr->regs_count == 0)
			continue;

		if (writes_gpr(instr)) {
			reg_assign(ctx, instr->regs[0], instr);
			if (instr->regs[0]->flags & IR3_REG_HALF)
				fixup_half_instr_dst(instr);
		}

		foreach_src_n(reg, n, instr) {
			struct ir3_instruction *src = reg->instr;
			/* Note: reg->instr could be null for IR3_REG_ARRAY */
			if (!(src || (reg->flags & IR3_REG_ARRAY)))
				continue;
			reg_assign(ctx, instr->regs[n+1], src);
			if (instr->regs[n+1]->flags & IR3_REG_HALF)
				fixup_half_instr_src(instr);
		}
	}
}

static int
ra_alloc(struct ir3_ra_ctx *ctx)
{
	unsigned n = 0;

	/* frag shader inputs get pre-assigned, since we have some
	 * constraints/unknowns about setup for some of these regs:
	 */
	if (ctx->type == SHADER_FRAGMENT) {
		struct ir3 *ir = ctx->ir;
		unsigned i = 0, j;
		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
			struct ir3_instruction *instr = ir->inputs[i];
			int cls = size_to_class(1, true);
			unsigned name = __ra_name(ctx, cls, instr);
			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];

			/* if we have frag_face, it gets hr0.x */
			ra_set_node_reg(ctx->g, name, reg);
			i += 4;
		}

		j = 0;
		for (; i < ir->ninputs; i++) {
			struct ir3_instruction *instr = ir->inputs[i];
			if (instr) {
				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

				if (id->defn == instr) {
					unsigned name, reg;

					name = ra_name(ctx, id);
					reg = ctx->set->gpr_to_ra_reg[id->cls][j];

					ra_set_node_reg(ctx->g, name, reg);
					j += id->sz;
				}
			}
		}
		n = j;
	}

	/* pre-assign array elements:
	 */
	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
		unsigned base = n;

		if (arr->end_ip == 0)
			continue;

		/* figure out what else we conflict with which has already
		 * been assigned:
		 */
retry:
		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
			if (arr2 == arr)
				break;
			if (arr2->end_ip == 0)
				continue;
			/* if it intersects with liverange AND register range.. */
			if (intersects(arr->start_ip, arr->end_ip,
					arr2->start_ip, arr2->end_ip) &&
				intersects(base, base + arr->length,
					arr2->reg, arr2->reg + arr2->length)) {
				base = MAX2(base, arr2->reg + arr2->length);
				goto retry;
			}
		}

		arr->reg = base;

		for (unsigned i = 0; i < arr->length; i++) {
			unsigned name, reg;

			name = arr->base + i;
			reg = ctx->set->gpr_to_ra_reg[0][base++];

			ra_set_node_reg(ctx->g, name, reg);
		}
	}

	if (!ra_allocate(ctx->g))
		return -1;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_alloc(ctx, block);
	}

	return 0;
}

int ir3_ra(struct ir3 *ir, enum shader_t type,
		bool frag_coord, bool frag_face)
{
	struct ir3_ra_ctx ctx = {
			.ir = ir,
			.type = type,
			.frag_face = frag_face,
			.set = ir->compiler->set,
	};
	int ret;

	ra_init(&ctx);
	ra_add_interference(&ctx);
	ret = ra_alloc(&ctx);
	ra_destroy(&ctx);

	return ret;
}
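
/* A sketch of the expected call-site (the shader-variant field names
 * here are hypothetical, for illustration only): run RA as the last
 * pass after scheduling, and treat a non-zero return as allocation
 * failure:
 *
 *    ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
 *    if (ret)
 *        DBG("RA failed!");
 */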