/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_print.h"
#include "panfrost/util/lcra.h"
#include "util/u_memory.h"

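/* Construct the interference graph: compute liveness, then walk each block
 * backwards from its live-out set, marking every node still live after an
 * instruction as interfering with that instruction's destination. */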
static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l)
{
        bi_compute_liveness(ctx);

        bi_foreach_block(ctx, _blk) {
                bi_block *blk = (bi_block *) _blk;
                uint16_t *live = mem_dup(_blk->live_out, l->node_count * sizeof(uint16_t));

                bi_foreach_instr_in_block_rev(blk, ins) {
                        /* Mark all registers live after the instruction as
                         * interfering with the destination */

                        if (ins->dest && (ins->dest < l->node_count)) {
                                for (unsigned i = 1; i < l->node_count; ++i) {
                                        if (live[i])
                                                lcra_add_node_interference(l, ins->dest, bi_writemask(ins), i, live[i]);
                                }
                        }

                        /* Update live_in */
                        bi_liveness_ins_update(live, ins, l->node_count);
                }

                free(live);
        }
}

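/* Register classes: everything is allocated out of a single class of 32-bit
 * work registers (R0-R63). */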
enum {
        BI_REG_CLASS_WORK = 0,
} bi_reg_class;

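/* Set up and solve the LCRA equations: one node per temporary, all in the
 * work register class, with 4-byte (32-bit) alignment and a 4-byte range.
 * Blend shaders start the class past R0-R3, which carry the blend input. */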
static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success)
{
        unsigned node_count = bi_max_temp(ctx);

        struct lcra_state *l =
                lcra_alloc_equations(node_count, 1);

        if (ctx->is_blend) {
                /* R0-R3 are reserved for the blend input */
                l->class_start[BI_REG_CLASS_WORK] = 4 * 4;
                l->class_size[BI_REG_CLASS_WORK] = 64 * 4;
        } else {
                /* R0 - R63, all 32-bit */
                l->class_start[BI_REG_CLASS_WORK] = 0;
                l->class_size[BI_REG_CLASS_WORK] = 64 * 4;
        }

        bi_foreach_instr_global(ctx, ins) {
                unsigned dest = ins->dest;

                if (!dest || (dest >= node_count))
                        continue;

                l->class[dest] = BI_REG_CLASS_WORK;
                lcra_set_alignment(l, dest, 2, 16); /* 2^2 = 4 */
                lcra_restrict_range(l, dest, 4);
        }

        bi_compute_interference(ctx, l);

        *success = lcra_solve(l);

        return l;
}

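/* Map an allocated node index to its register, adding a register offset.
 * Indices out of range (RA never ran for them) or that LCRA left unsolved
 * pass through unchanged. */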
static unsigned
bi_reg_from_index(struct lcra_state *l, unsigned index, unsigned offset)
{
        /* Did we run RA for this index at all? */
        if (index >= l->node_count)
                return index;

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[index];
        if (solution < 0)
                return index;

        assert((solution & 0x3) == 0);
        unsigned reg = solution / 4;
        reg += offset;

        return BIR_INDEX_REGISTER | reg;
}

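/* Rewrite one source of an instruction to its allocated register. For
 * non-vector sources, the swizzle acts as a component select: components that
 * land in a later 32-bit word become a register offset, and the swizzle is
 * rewritten relative to that word. For example, a 16-bit source swizzled .zw
 * (components 2-3) lives entirely in the second word, so the offset becomes 1
 * and the swizzle becomes .xy. A swizzle may not span two registers. */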
static void
bi_adjust_src_ra(bi_instruction *ins, struct lcra_state *l, unsigned src)
{
        if (ins->src[src] >= l->node_count)
                return;

        bool vector = (bi_class_props[ins->type] & BI_VECTOR) && src == 0;
        unsigned offset = 0;

        if (vector) {
                /* TODO: Do we do anything here? */
        } else {
                /* Use the swizzle as component select */
                unsigned components = bi_get_component_count(ins, src);

                nir_alu_type T = ins->src_types[src];
                unsigned size = nir_alu_type_get_type_size(T);

                /* TODO: 64-bit? */
                unsigned components_per_word = MAX2(32 / size, 1);

                for (unsigned i = 0; i < components; ++i) {
                        unsigned off = ins->swizzle[src][i] / components_per_word;

                        /* We can't cross register boundaries in a swizzle */
                        if (i == 0)
                                offset = off;
                        else
                                assert(off == offset);

                        ins->swizzle[src][i] %= components_per_word;
                }
        }

        ins->src[src] = bi_reg_from_index(l, ins->src[src], offset);
}

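/* Rewrite the destination to its allocated register, folding the destination
 * offset into the register number. */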
static void
bi_adjust_dest_ra(bi_instruction *ins, struct lcra_state *l)
{
        if (ins->dest >= l->node_count)
                return;

        ins->dest = bi_reg_from_index(l, ins->dest, ins->dest_offset);
        ins->dest_offset = 0;
}

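/* With a solution in hand, rewrite every instruction in the program to use
 * the allocated registers. */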
static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_adjust_dest_ra(ins, l);

                bi_foreach_src(ins, s)
                        bi_adjust_src_ra(ins, l, s);
        }
}

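/* Rewrite all sources of a single instruction matching one index to another,
 * used to point a spilled node's uses at the temporary written by a fill. */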
static void
bi_rewrite_index_src_single(bi_instruction *ins, unsigned old, unsigned new)
{
        bi_foreach_src(ins, i) {
                if (ins->src[i] == old)
                        ins->src[i] = new;
        }
}

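/* Construct a store to thread-local storage (the spill), writing the node out
 * at the given byte offset. The offset is carried as the instruction's inline
 * 64-bit constant, referenced as two 32-bit halves by the constant sources. */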
static bi_instruction
bi_spill(unsigned node, uint64_t offset, unsigned channels)
{
        bi_instruction store = {
                .type = BI_STORE,
                .segment = BI_SEGMENT_TLS,
                .vector_channels = channels,
                .src = {
                        node,
                        BIR_INDEX_CONSTANT,
                        BIR_INDEX_CONSTANT | 32,
                },
                .src_types = {
                        nir_type_uint32,
                        nir_type_uint32,
                        nir_type_uint32
                },
                .constant = { .u64 = offset },
        };

        return store;
}

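/* Construct the matching load from thread-local storage (the fill), reading
 * the spilled value back into a node at the same offset. */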
static bi_instruction
bi_fill(unsigned node, uint64_t offset, unsigned channels)
{
        bi_instruction load = {
                .type = BI_LOAD,
                .segment = BI_SEGMENT_TLS,
                .vector_channels = channels,
                .dest = node,
                .dest_type = nir_type_uint32,
                .src = {
                        BIR_INDEX_CONSTANT,
                        BIR_INDEX_CONSTANT | 32,
                },
                .src_types = {
                        nir_type_uint32,
                        nir_type_uint32
                },
                .constant = { .u64 = offset },
        };

        return load;
}

/* Get the single instruction in a singleton clause. Precondition: clause
 * contains exactly 1 instruction.
 *
 * More complex scheduling implies tougher constraints on spilling. We'll cross
 * that bridge when we get to it. For now, just grab the one and only
 * instruction in the clause */

static bi_instruction *
bi_unwrap_singleton(bi_clause *clause)
{
       assert(clause->bundle_count == 1);
       assert((clause->bundles[0].fma != NULL) ^ (clause->bundles[0].add != NULL));

       return clause->bundles[0].fma ? clause->bundles[0].fma
               : clause->bundles[0].add;
}

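/* Insert a new instruction next to a singleton clause, wrapping it in a fresh
 * singleton clause of its own and linking both the clause and the instruction
 * before or after the cursor. */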
static inline void
bi_insert_singleton(void *memctx, bi_clause *cursor, bi_block *block,
                bi_instruction ins, bool before)
{
        bi_instruction *uins = rzalloc(memctx, bi_instruction);
        memcpy(uins, &ins, sizeof(ins));

        /* Get the instruction to pivot around. This should be the first/last
         * instruction of the clause depending on the before setting; for
         * singletons, these coincide */
        bi_instruction *cursor_ins = bi_unwrap_singleton(cursor);

        bi_clause *clause = bi_make_singleton(memctx, uins,
                        block, 0, (1 << 0), true);

        if (before) {
                list_addtail(&clause->link, &cursor->link);
                list_addtail(&uins->link, &cursor_ins->link);
        } else {
                list_add(&clause->link, &cursor->link);
                list_add(&uins->link, &cursor_ins->link);
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */

        bi_foreach_instr_global(ctx, ins) {
                if (ins->no_spill)
                        lcra_set_node_spill_cost(l, ins->dest, -1);
        }

        for (unsigned i = PAN_IS_REG; i < l->node_count; i += 2)
                lcra_set_node_spill_cost(l, i, -1);

        return lcra_get_best_spill_node(l);
}

/* Once we've chosen a spill node, spill it. Precondition: node is a valid
 * SSA node in the non-optimized scheduled IR that was not already
 * spilled (enforced by bi_choose_spill_node). Returns bytes spilled */

static unsigned
bi_spill_register(bi_context *ctx, unsigned node, unsigned offset)
{
        assert(!(node & PAN_IS_REG));

        unsigned channels = 1;

        /* Spill after every store */
        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;
                bi_foreach_clause_in_block_safe(block, clause) {
                        bi_instruction *ins = bi_unwrap_singleton(clause);

                        if (ins->dest != node) continue;

                        ins->dest = bi_make_temp(ctx);
                        ins->no_spill = true;
                        channels = MAX2(channels, ins->vector_channels);

                        bi_instruction st = bi_spill(ins->dest, offset, channels);
                        bi_insert_singleton(ctx, clause, block, st, false);
                        ctx->spills++;
                }
        }

        /* Fill before every use */
        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;
                bi_foreach_clause_in_block_safe(block, clause) {
                        bi_instruction *ins = bi_unwrap_singleton(clause);
                        if (!bi_has_arg(ins, node)) continue;

                        /* Don't rewrite spills themselves */
                        if (ins->segment == BI_SEGMENT_TLS) continue;

                        unsigned index = bi_make_temp(ctx);

                        bi_instruction ld = bi_fill(index, offset, channels);
                        ld.no_spill = true;
                        bi_insert_singleton(ctx, clause, block, ld, true);

                        /* Rewrite to use */
                        bi_rewrite_index_src_single(ins, node, index);
                        ctx->fills++;
                }
        }

        return (channels * 4);
}

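/* Top-level register allocation: attempt to allocate, and if LCRA fails, pick
 * the best spill candidate, spill it to TLS, and retry, up to an iteration
 * limit. */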
void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 100; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = 0;

        /* For instructions that both read from and write to a data register,
         * it's the *same* data register. We enforce that constraint by just
         * doing a quick rewrite. TODO: are there cases where this causes RA
         * to have no solutions due to copyprop? */
        bi_foreach_instr_global(ctx, ins) {
                unsigned props = bi_class_props[ins->type];
                unsigned both = BI_DATA_REG_SRC | BI_DATA_REG_DEST;
                if ((props & both) != both) continue;

                bi_rewrite_uses(ctx, ins->dest, 0, ins->src[0], 0);
                ins->dest = ins->src[0];
        }

        do {
                if (l) {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx, spill_node, spill_count);
                }

                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success);
        } while (!success && ((iter_count--) > 0));

        assert(success);

        ctx->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}