1 /*
2  * Copyright (c) 2017 Lima Project
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sub license,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the
12  * next paragraph) shall be included in all copies or substantial portions
13  * of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include "gpir.h"
26 #include "util/u_dynarray.h"
27 
28 /* Per-register information */
29 
30 struct reg_info {
31    BITSET_WORD *conflicts;
32    struct util_dynarray conflict_list;
33 
34    /* Number of conflicts that must be allocated to physical registers.
35     */
36    unsigned phys_conflicts;
37 
38    unsigned node_conflicts;
39 
40    /* Number of conflicts that can be allocated to either. */
41    unsigned total_conflicts;
42 
43    int assigned_color;
44 
45    bool visited;
46 };
47 
48 struct regalloc_ctx {
49    unsigned bitset_words, num_nodes_and_regs;
50    struct reg_info *registers;
51 
52    /* Reusable scratch liveness array */
53    BITSET_WORD *live;
54 
55    unsigned *worklist;
56    unsigned worklist_start, worklist_end;
57 
58    unsigned *stack;
59    unsigned stack_size;
60 
61    gpir_compiler *comp;
62    void *mem_ctx;
63 };
64 
65 /* Liveness analysis */
66 
propagate_liveness_instr(gpir_node * node,BITSET_WORD * live,gpir_compiler * comp)67 static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live,
68                                      gpir_compiler *comp)
69 {
70    /* KILL */
71    if (node->type == gpir_node_type_store) {
72       if (node->op == gpir_op_store_reg) {
73          gpir_store_node *store = gpir_node_to_store(node);
74          BITSET_CLEAR(live, store->reg->index);
75       }
76    }
77 
78    /* GEN */
79    if (node->type == gpir_node_type_load) {
80       if (node->op == gpir_op_load_reg) {
81          gpir_load_node *load = gpir_node_to_load(node);
82          BITSET_SET(live, load->reg->index);
83       }
84    }
85 }
86 
propagate_liveness_block(gpir_block * block,struct regalloc_ctx * ctx)87 static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx)
88 {
89    for (unsigned i = 0; i < 2; i++) {
90       if (block->successors[i]) {
91          for (unsigned j = 0; j < ctx->bitset_words; j++)
92             block->live_out[j] |= block->successors[i]->live_in[j];
93       }
94    }
95 
96    memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD));
97 
98    list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
99       propagate_liveness_instr(node, ctx->live, block->comp);
100    }
101 
102    bool changed = false;
103    for (unsigned i = 0; i < ctx->bitset_words; i++) {
104       changed |= (block->live_in[i] != ctx->live[i]);
105       block->live_in[i] = ctx->live[i];
106    }
107    return changed;
108 }
109 
calc_def_block(gpir_block * block)110 static void calc_def_block(gpir_block *block)
111 {
112    list_for_each_entry(gpir_node, node, &block->node_list, list) {
113       if (node->op == gpir_op_store_reg) {
114          gpir_store_node *store = gpir_node_to_store(node);
115          BITSET_SET(block->def_out, store->reg->index);
116       }
117    }
118 }
119 
calc_liveness(struct regalloc_ctx * ctx)120 static void calc_liveness(struct regalloc_ctx *ctx)
121 {
122    bool changed = true;
123    while (changed) {
124       changed = false;
125       list_for_each_entry_rev(gpir_block, block, &ctx->comp->block_list, list) {
126          changed |= propagate_liveness_block(block, ctx);
127       }
128    }
129 
130    list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
131       calc_def_block(block);
132    }
133 
134    changed = true;
135    while (changed) {
136       changed = false;
137       list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
138          for (unsigned i = 0; i < 2; i++) {
139             gpir_block *succ = block->successors[i];
140             if (!succ)
141                continue;
142 
143             for (unsigned j = 0; j < ctx->bitset_words; j++) {
144                BITSET_WORD new = block->def_out[j] & ~succ->def_out[j];
145                changed |= (new != 0);
146                succ->def_out[j] |= block->def_out[j];
147             }
148          }
149       }
150    }
151 }
152 
153 /* Interference calculation */
154 
add_interference(struct regalloc_ctx * ctx,unsigned i,unsigned j)155 static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j)
156 {
157    if (i == j)
158       return;
159 
160    struct reg_info *a = &ctx->registers[i];
161    struct reg_info *b = &ctx->registers[j];
162 
163    if (BITSET_TEST(a->conflicts, j))
164       return;
165 
166    BITSET_SET(a->conflicts, j);
167    BITSET_SET(b->conflicts, i);
168 
169    a->total_conflicts++;
170    b->total_conflicts++;
171    if (j < ctx->comp->cur_reg)
172       a->phys_conflicts++;
173    else
174       a->node_conflicts++;
175 
176    if (i < ctx->comp->cur_reg)
177       b->phys_conflicts++;
178    else
179       b->node_conflicts++;
180 
181    util_dynarray_append(&a->conflict_list, unsigned, j);
182    util_dynarray_append(&b->conflict_list, unsigned, i);
183 }
184 
185 /* Make the register or node "i" intefere with all the other live registers
186  * and nodes.
187  */
add_all_interferences(struct regalloc_ctx * ctx,unsigned i,BITSET_WORD * live_nodes,BITSET_WORD * live_regs)188 static void add_all_interferences(struct regalloc_ctx *ctx,
189                                   unsigned i,
190                                   BITSET_WORD *live_nodes,
191                                   BITSET_WORD *live_regs)
192 {
193    int live_node;
194    BITSET_FOREACH_SET(live_node, live_nodes, ctx->comp->cur_index) {
195       add_interference(ctx, i,
196                        live_node + ctx->comp->cur_reg);
197    }
198 
199    int live_reg;
200    BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_index) {
201       add_interference(ctx, i, live_reg);
202    }
203 
204 }
205 
print_liveness(struct regalloc_ctx * ctx,BITSET_WORD * live_reg,BITSET_WORD * live_val)206 static void print_liveness(struct regalloc_ctx *ctx,
207                            BITSET_WORD *live_reg, BITSET_WORD *live_val)
208 {
209    if (!(lima_debug & LIMA_DEBUG_GP))
210       return;
211 
212    int live_idx;
213    BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) {
214       printf("reg%d ", live_idx);
215    }
216    BITSET_FOREACH_SET(live_idx, live_val, ctx->comp->cur_index) {
217       printf("%d ", live_idx);
218    }
219    printf("\n");
220 }
221 
calc_interference(struct regalloc_ctx * ctx)222 static void calc_interference(struct regalloc_ctx *ctx)
223 {
224    BITSET_WORD *live_nodes =
225       rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index);
226 
227    list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
228       /* Initialize liveness at the end of the block, but exclude values that
229        * definitely aren't defined by the end. This helps out with
230        * partially-defined registers, like:
231        *
232        * if (condition) {
233        *    foo = ...;
234        * }
235        * if (condition) {
236        *    ... = foo;
237        * }
238        *
239        * If we naively propagated liveness backwards, foo would be live from
240        * the beginning of the program, but if we're not inside a loop then
241        * its value is undefined before the first if and we don't have to
242        * consider it live. Mask out registers like foo here.
243        */
244       for (unsigned i = 0; i < ctx->bitset_words; i++) {
245          ctx->live[i] = block->live_out[i] & block->def_out[i];
246       }
247 
248       list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
249          gpir_debug("processing node %d\n", node->index);
250          print_liveness(ctx, ctx->live, live_nodes);
251          if (node->type != gpir_node_type_store &&
252              node->type != gpir_node_type_branch) {
253             add_all_interferences(ctx, node->index + ctx->comp->cur_reg,
254                                   live_nodes, ctx->live);
255 
256             /* KILL */
257             BITSET_CLEAR(live_nodes, node->index);
258          } else if (node->op == gpir_op_store_reg) {
259             gpir_store_node *store = gpir_node_to_store(node);
260             add_all_interferences(ctx, store->reg->index,
261                                   live_nodes, ctx->live);
262 
263             /* KILL */
264             BITSET_CLEAR(ctx->live, store->reg->index);
265          }
266 
267          /* GEN */
268          if (node->type == gpir_node_type_store) {
269             gpir_store_node *store = gpir_node_to_store(node);
270             BITSET_SET(live_nodes, store->child->index);
271          } else if (node->type == gpir_node_type_alu) {
272             gpir_alu_node *alu = gpir_node_to_alu(node);
273             for (int i = 0; i < alu->num_child; i++)
274                BITSET_SET(live_nodes, alu->children[i]->index);
275          } else if (node->type == gpir_node_type_branch) {
276             gpir_branch_node *branch = gpir_node_to_branch(node);
277             BITSET_SET(live_nodes, branch->cond->index);
278          } else if (node->op == gpir_op_load_reg) {
279             gpir_load_node *load = gpir_node_to_load(node);
280             BITSET_SET(ctx->live, load->reg->index);
281          }
282       }
283    }
284 }
285 
286 /* Register allocation */
287 
can_simplify(struct regalloc_ctx * ctx,unsigned i)288 static bool can_simplify(struct regalloc_ctx *ctx, unsigned i)
289 {
290    struct reg_info *info = &ctx->registers[i];
291    if (i < ctx->comp->cur_reg) {
292       /* Physical regs. */
293       return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM;
294    } else {
295       /* Nodes: if we manage to allocate all of its conflicting physical
296        * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so
297        * we can ignore any more than that.
298        */
299       return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) +
300          info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM;
301    }
302 }
303 
push_stack(struct regalloc_ctx * ctx,unsigned i)304 static void push_stack(struct regalloc_ctx *ctx, unsigned i)
305 {
306    ctx->stack[ctx->stack_size++] = i;
307    if (i < ctx->comp->cur_reg)
308       gpir_debug("pushing reg%u\n", i);
309    else
310       gpir_debug("pushing %d\n", i - ctx->comp->cur_reg);
311 
312    struct reg_info *info = &ctx->registers[i];
313    assert(info->visited);
314 
315    util_dynarray_foreach(&info->conflict_list, unsigned, conflict) {
316       struct reg_info *conflict_info = &ctx->registers[*conflict];
317       if (i < ctx->comp->cur_reg) {
318          assert(conflict_info->phys_conflicts > 0);
319          conflict_info->phys_conflicts--;
320       } else {
321          assert(conflict_info->node_conflicts > 0);
322          conflict_info->node_conflicts--;
323       }
324       if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) {
325          ctx->worklist[ctx->worklist_end++] = *conflict;
326          ctx->registers[*conflict].visited = true;
327       }
328    }
329 }
330 
do_regalloc(struct regalloc_ctx * ctx)331 static bool do_regalloc(struct regalloc_ctx *ctx)
332 {
333    ctx->worklist_start = 0;
334    ctx->worklist_end = 0;
335    ctx->stack_size = 0;
336 
337    /* Step 1: find the initially simplifiable registers */
338    for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) {
339       if (can_simplify(ctx, i)) {
340          ctx->worklist[ctx->worklist_end++] = i;
341          ctx->registers[i].visited = true;
342       }
343    }
344 
345    while (true) {
346       /* Step 2: push onto the stack whatever we can */
347       while (ctx->worklist_start != ctx->worklist_end) {
348          push_stack(ctx, ctx->worklist[ctx->worklist_start++]);
349       }
350 
351       if (ctx->stack_size < ctx->num_nodes_and_regs) {
352          /* If there are still unsimplifiable nodes left, we need to
353           * optimistically push a node onto the stack.  Choose the one with
354           * the smallest number of current neighbors, since that's the most
355           * likely to succeed.
356           */
357          unsigned min_conflicts = UINT_MAX;
358          unsigned best_reg = 0;
359          for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) {
360             struct reg_info *info = &ctx->registers[reg];
361             if (info->visited)
362                continue;
363             if (info->phys_conflicts + info->node_conflicts < min_conflicts) {
364                best_reg = reg;
365                min_conflicts = info->phys_conflicts + info->node_conflicts;
366             }
367          }
368          gpir_debug("optimistic triggered\n");
369          ctx->registers[best_reg].visited = true;
370          push_stack(ctx, best_reg);
371       } else {
372          break;
373       }
374    }
375 
376    /* Step 4: pop off the stack and assign colors */
377    for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) {
378       unsigned idx = ctx->stack[i];
379       struct reg_info *reg = &ctx->registers[idx];
380 
381       unsigned num_available_regs;
382       if (idx < ctx->comp->cur_reg) {
383          num_available_regs = GPIR_PHYSICAL_REG_NUM;
384       } else {
385          num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM;
386       }
387 
388       bool found = false;
389       unsigned start = i % num_available_regs;
390       for (unsigned j = 0; j < num_available_regs; j++) {
391          unsigned candidate = (j + start) % num_available_regs;
392          bool available = true;
393          util_dynarray_foreach(&reg->conflict_list, unsigned, conflict_idx) {
394             struct reg_info *conflict = &ctx->registers[*conflict_idx];
395             if (conflict->assigned_color >= 0 &&
396                 conflict->assigned_color == (int) candidate) {
397                available = false;
398                break;
399             }
400          }
401 
402          if (available) {
403             reg->assigned_color = candidate;
404             found = true;
405             break;
406          }
407       }
408 
409       /* TODO: spilling */
410       if (!found) {
411          gpir_error("Failed to allocate registers\n");
412          return false;
413       }
414    }
415 
416    return true;
417 }
418 
assign_regs(struct regalloc_ctx * ctx)419 static void assign_regs(struct regalloc_ctx *ctx)
420 {
421    list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) {
422       list_for_each_entry(gpir_node, node, &block->node_list, list) {
423          if (node->index >= 0) {
424             node->value_reg =
425                ctx->registers[ctx->comp->cur_reg + node->index].assigned_color;
426          }
427 
428          if (node->op == gpir_op_load_reg) {
429             gpir_load_node *load = gpir_node_to_load(node);
430             unsigned color = ctx->registers[load->reg->index].assigned_color;
431             load->index = color / 4;
432             load->component = color % 4;
433          }
434 
435          if (node->op == gpir_op_store_reg) {
436             gpir_store_node *store = gpir_node_to_store(node);
437             unsigned color = ctx->registers[store->reg->index].assigned_color;
438             store->index = color / 4;
439             store->component = color % 4;
440             node->value_reg = color;
441          }
442       }
443 
444       block->live_out_phys = 0;
445 
446       int reg_idx;
447       BITSET_FOREACH_SET(reg_idx, block->live_out, ctx->comp->cur_reg) {
448          if (BITSET_TEST(block->def_out, reg_idx)) {
449             block->live_out_phys |= (1ull << ctx->registers[reg_idx].assigned_color);
450          }
451       }
452    }
453 }
454 
regalloc_print_result(gpir_compiler * comp)455 static void regalloc_print_result(gpir_compiler *comp)
456 {
457    if (!(lima_debug & LIMA_DEBUG_GP))
458       return;
459 
460    int index = 0;
461    printf("======== regalloc ========\n");
462    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
463       list_for_each_entry(gpir_node, node, &block->node_list, list) {
464          printf("%03d: %d/%d %s ", index++, node->index, node->value_reg,
465                 gpir_op_infos[node->op].name);
466          gpir_node_foreach_pred(node, dep) {
467             gpir_node *pred = dep->pred;
468             printf(" %d/%d", pred->index, pred->value_reg);
469          }
470          if (node->op == gpir_op_load_reg) {
471             gpir_load_node *load = gpir_node_to_load(node);
472             printf(" -/%d", 4 * load->index + load->component);
473             printf(" (%d)", load->reg->index);
474          } else if (node->op == gpir_op_store_reg) {
475             gpir_store_node *store = gpir_node_to_store(node);
476             printf(" (%d)", store->reg->index);
477          }
478          printf("\n");
479       }
480       printf("----------------------------\n");
481    }
482 }
483 
gpir_regalloc_prog(gpir_compiler * comp)484 bool gpir_regalloc_prog(gpir_compiler *comp)
485 {
486    struct regalloc_ctx ctx;
487 
488    ctx.mem_ctx = ralloc_context(NULL);
489    ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index;
490    ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs);
491    ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
492    ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
493    ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs);
494    ctx.comp = comp;
495 
496    ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs);
497    for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) {
498       ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD,
499                                                  ctx.bitset_words);
500       util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx);
501    }
502 
503    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
504       block->live_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
505       block->live_in = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
506       block->def_out = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words);
507    }
508 
509    calc_liveness(&ctx);
510    calc_interference(&ctx);
511    if (!do_regalloc(&ctx)) {
512       ralloc_free(ctx.mem_ctx);
513       return false;
514    }
515    assign_regs(&ctx);
516 
517    regalloc_print_result(comp);
518    ralloc_free(ctx.mem_ctx);
519    return true;
520 }
521