/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig
 */

#include "compiler.h"
#include "bi_print.h"
#include "panfrost/util/lcra.h"
#include "util/u_memory.h"

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l)
{
        bi_compute_liveness(ctx);

        bi_foreach_block(ctx, _blk) {
                bi_block *blk = (bi_block *) _blk;
                uint16_t *live = mem_dup(_blk->live_out,
                                l->node_count * sizeof(uint16_t));

                bi_foreach_instr_in_block_rev(blk, ins) {
                        /* Mark all registers live after the instruction as
                         * interfering with the destination */

                        if (ins->dest && (ins->dest < l->node_count)) {
                                for (unsigned i = 1; i < l->node_count; ++i) {
                                        if (live[i]) {
                                                lcra_add_node_interference(l,
                                                                ins->dest, bi_writemask(ins),
                                                                i, live[i]);
                                        }
                                }
                        }

                        /* Update live_in */
                        bi_liveness_ins_update(live, ins, l->node_count);
                }

                free(live);
        }
}

/* Register classes for LCRA. There is only one class for now: 32-bit work
 * registers */

enum bi_reg_class {
        BI_REG_CLASS_WORK = 0,
};

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count, 1);

        if (ctx->is_blend) {
                /* R0-R3 are reserved for the blend input, so work registers
                 * are R4-R63 */
                l->class_start[BI_REG_CLASS_WORK] = 4 * 4;
                l->class_size[BI_REG_CLASS_WORK] = 60 * 4;
        } else {
                /* R0 - R63, all 32-bit */
                l->class_start[BI_REG_CLASS_WORK] = 0;
                l->class_size[BI_REG_CLASS_WORK] = 64 * 4;
        }

        bi_foreach_instr_global(ctx, ins) {
                unsigned dest = ins->dest;

                if (!dest || (dest >= node_count))
                        continue;

                l->class[dest] = BI_REG_CLASS_WORK;
                lcra_set_alignment(l, dest, 2, 16); /* 2^2 = 4 */
                lcra_restrict_range(l, dest, 4);
        }

        bi_compute_interference(ctx, l);

        *success = lcra_solve(l);

        return l;
}

static unsigned
bi_reg_from_index(struct lcra_state *l, unsigned index, unsigned offset)
{
        /* Did we run RA for this index at all? */
        if (index >= l->node_count)
                return index;

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[index];
        if (solution < 0)
                return index;

        /* Solutions are in bytes; work registers are 4-byte aligned */
        assert((solution & 0x3) == 0);
        unsigned reg = solution / 4;
        reg += offset;

        return BIR_INDEX_REGISTER | reg;
}
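/* Worked example (illustrative values, not from the original source): if
 * LCRA assigns a node the byte solution 20 and the caller passes offset 1,
 * then solution / 4 selects R5, the offset bumps that to R6, and the
 * function returns BIR_INDEX_REGISTER | 6. */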
static void
bi_adjust_src_ra(bi_instruction *ins, struct lcra_state *l, unsigned src)
{
        if (ins->src[src] >= l->node_count)
                return;

        bool vector = (bi_class_props[ins->type] & BI_VECTOR) && src == 0;
        unsigned offset = 0;

        if (vector) {
                /* TODO: Do we do anything here? */
        } else {
                /* Use the swizzle as component select */
                unsigned components = bi_get_component_count(ins, src);

                nir_alu_type T = ins->src_types[src];
                unsigned size = nir_alu_type_get_type_size(T);

                /* TODO: 64-bit? */
                unsigned components_per_word = MAX2(32 / size, 1);

                for (unsigned i = 0; i < components; ++i) {
                        unsigned off = ins->swizzle[src][i] / components_per_word;

                        /* We can't cross register boundaries in a swizzle */
                        if (i == 0)
                                offset = off;
                        else
                                assert(off == offset);

                        ins->swizzle[src][i] %= components_per_word;
                }
        }

        ins->src[src] = bi_reg_from_index(l, ins->src[src], offset);
}

static void
bi_adjust_dest_ra(bi_instruction *ins, struct lcra_state *l)
{
        if (ins->dest >= l->node_count)
                return;

        ins->dest = bi_reg_from_index(l, ins->dest, ins->dest_offset);
        ins->dest_offset = 0;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_adjust_dest_ra(ins, l);

                bi_foreach_src(ins, s)
                        bi_adjust_src_ra(ins, l, s);
        }
}

static void
bi_rewrite_index_src_single(bi_instruction *ins, unsigned old, unsigned new)
{
        bi_foreach_src(ins, i) {
                if (ins->src[i] == old)
                        ins->src[i] = new;
        }
}

static bi_instruction
bi_spill(unsigned node, uint64_t offset, unsigned channels)
{
        bi_instruction store = {
                .type = BI_STORE,
                .segment = BI_SEGMENT_TLS,
                .vector_channels = channels,
                .src = {
                        node,
                        BIR_INDEX_CONSTANT,
                        BIR_INDEX_CONSTANT | 32,
                },
                .src_types = {
                        nir_type_uint32,
                        nir_type_uint32,
                        nir_type_uint32,
                },
                .constant = { .u64 = offset },
        };

        return store;
}

static bi_instruction
bi_fill(unsigned node, uint64_t offset, unsigned channels)
{
        bi_instruction load = {
                .type = BI_LOAD,
                .segment = BI_SEGMENT_TLS,
                .vector_channels = channels,
                .dest = node,
                .dest_type = nir_type_uint32,
                .src = {
                        BIR_INDEX_CONSTANT,
                        BIR_INDEX_CONSTANT | 32,
                },
                .src_types = {
                        nir_type_uint32,
                        nir_type_uint32,
                },
                .constant = { .u64 = offset },
        };

        return load;
}
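/* In both bi_spill and bi_fill above, the address comes from the inline
 * constant: BIR_INDEX_CONSTANT selects the low 32-bit word of .constant.u64
 * and BIR_INDEX_CONSTANT | 32 the high word, together carrying the TLS
 * offset of the spill slot. */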
/* Get the single instruction in a singleton clause. Precondition: clause
 * contains exactly 1 instruction.
 *
 * More complex scheduling implies tougher constraints on spilling. We'll
 * cross that bridge when we get to it. For now, just grab the one and only
 * instruction in the clause */

static bi_instruction *
bi_unwrap_singleton(bi_clause *clause)
{
        assert(clause->bundle_count == 1);
        assert((clause->bundles[0].fma != NULL) ^ (clause->bundles[0].add != NULL));

        return clause->bundles[0].fma ? clause->bundles[0].fma :
                clause->bundles[0].add;
}

static inline void
bi_insert_singleton(void *memctx, bi_clause *cursor, bi_block *block,
                bi_instruction ins, bool before)
{
        bi_instruction *uins = rzalloc(memctx, bi_instruction);
        memcpy(uins, &ins, sizeof(ins));

        /* Get the instruction to pivot around. It should be the first/last
         * instruction of the clause depending on the before setting; those
         * coincide for singletons */
        bi_instruction *cursor_ins = bi_unwrap_singleton(cursor);

        bi_clause *clause = bi_make_singleton(memctx, uins,
                        block, 0, (1 << 0), true);

        if (before) {
                list_addtail(&clause->link, &cursor->link);
                list_addtail(&uins->link, &cursor_ins->link);
        } else {
                list_add(&clause->link, &cursor->link);
                list_add(&uins->link, &cursor_ins->link);
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */

        bi_foreach_instr_global(ctx, ins) {
                if (ins->no_spill)
                        lcra_set_node_spill_cost(l, ins->dest, -1);
        }

        for (unsigned i = PAN_IS_REG; i < l->node_count; i += 2)
                lcra_set_node_spill_cost(l, i, -1);

        return lcra_get_best_spill_node(l);
}

/* Once we've chosen a spill node, spill it. Precondition: node is a valid
 * SSA node in the non-optimized scheduled IR that was not already spilled
 * (enforced by bi_choose_spill_node). Returns bytes spilled */

static unsigned
bi_spill_register(bi_context *ctx, unsigned node, unsigned offset)
{
        assert(!(node & PAN_IS_REG));

        unsigned channels = 1;

        /* Spill (store to TLS) after every write of the node */
        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                bi_foreach_clause_in_block_safe(block, clause) {
                        bi_instruction *ins = bi_unwrap_singleton(clause);

                        if (ins->dest != node) continue;

                        ins->dest = bi_make_temp(ctx);
                        ins->no_spill = true;
                        channels = MAX2(channels, ins->vector_channels);

                        bi_instruction st = bi_spill(ins->dest, offset, channels);
                        bi_insert_singleton(ctx, clause, block, st, false);
                        ctx->spills++;
                }
        }

        /* Fill (load from TLS) before every use */
        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                bi_foreach_clause_in_block_safe(block, clause) {
                        bi_instruction *ins = bi_unwrap_singleton(clause);

                        if (!bi_has_arg(ins, node)) continue;

                        /* Don't rewrite spills themselves */
                        if (ins->segment == BI_SEGMENT_TLS) continue;

                        unsigned index = bi_make_temp(ctx);

                        bi_instruction ld = bi_fill(index, offset, channels);
                        ld.no_spill = true;
                        bi_insert_singleton(ctx, clause, block, ld, true);

                        /* Rewrite to use */
                        bi_rewrite_index_src_single(ins, node, index);
                        ctx->fills++;
                }
        }

        return (channels * 4);
}

void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 100; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = 0;

        /* For instructions that both read and write from a data register,
         * it's the *same* data register. We enforce that constraint by just
         * doing a quick rewrite. TODO: are there cases where this causes RA
         * to have no solutions due to copyprop? */
        bi_foreach_instr_global(ctx, ins) {
                unsigned props = bi_class_props[ins->type];
                unsigned both = BI_DATA_REG_SRC | BI_DATA_REG_DEST;

                if ((props & both) != both) continue;

                bi_rewrite_uses(ctx, ins->dest, 0, ins->src[0], 0);
                ins->dest = ins->src[0];
        }

        do {
                if (l) {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx, spill_node,
                                        spill_count);
                }

                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success);
        } while (!success && ((iter_count--) > 0));

        assert(success);

        ctx->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}
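/* Note on termination: each failed solve spills one more node, the
 * temporaries created while spilling are marked no_spill so they cannot be
 * chosen again, and iter_count caps the loop at 100 iterations, so the
 * assert(success) above fires only if spilling cannot make the program
 * colorable within that budget. */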