1 /*
2  * Copyright (C) 2019 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 #include "util/u_math.h"
29 #include "util/u_memory.h"
30 
31 /* This pass promotes reads from uniforms from load/store ops to uniform
32  * registers if it is beneficial to do so. Normally, this saves both
33  * instructions and total register pressure, but it does take a toll on the
34  * number of work registers that are available, so this is a balance.
35  *
36  * We use a heuristic to determine the ideal count, implemented by
37  * mir_work_heuristic, which returns the ideal number of work registers.
38  */
39 
40 static bool
mir_is_promoteable_ubo(midgard_instruction * ins)41 mir_is_promoteable_ubo(midgard_instruction *ins)
42 {
43         /* TODO: promote unaligned access via swizzle? */
44 
45         return (ins->type == TAG_LOAD_STORE_4) &&
46                 (OP_IS_UBO_READ(ins->op)) &&
47                 !(ins->constants.u32[0] & 0xF) &&
48                 !(ins->load_store.arg_1) &&
49                 (ins->load_store.arg_2 == 0x1E) &&
50                 ((ins->constants.u32[0] / 16) < 16);
51 }
52 
53 static unsigned
mir_promoteable_uniform_count(compiler_context * ctx)54 mir_promoteable_uniform_count(compiler_context *ctx)
55 {
56         unsigned count = 0;
57 
58         mir_foreach_instr_global(ctx, ins) {
59                 if (mir_is_promoteable_ubo(ins))
60                         count = MAX2(count, ins->constants.u32[0] / 16);
61         }
62 
63         return count;
64 }
65 
/* Sums the number of set bits across the first temp_count entries of a
 * liveness array, where each entry is a 16-bit mask. */

static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
        unsigned total = 0;
        unsigned idx = 0;

        while (idx < temp_count) {
                total += util_bitcount(live[idx]);
                idx++;
        }

        return total;
}
76 
77 static unsigned
mir_estimate_pressure(compiler_context * ctx)78 mir_estimate_pressure(compiler_context *ctx)
79 {
80         mir_invalidate_liveness(ctx);
81         mir_compute_liveness(ctx);
82 
83         unsigned max_live = 0;
84 
85         mir_foreach_block(ctx, _block) {
86                 midgard_block *block = (midgard_block *) _block;
87                 uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));
88 
89                 mir_foreach_instr_in_block_rev(block, ins) {
90                         unsigned count = mir_count_live(live, ctx->temp_count);
91                         max_live = MAX2(max_live, count);
92                         mir_liveness_ins_update(live, ins, ctx->temp_count);
93                 }
94 
95                 free(live);
96         }
97 
98         return DIV_ROUND_UP(max_live, 16);
99 }
100 
101 static unsigned
mir_work_heuristic(compiler_context * ctx)102 mir_work_heuristic(compiler_context *ctx)
103 {
104         unsigned uniform_count = mir_promoteable_uniform_count(ctx);
105 
106         /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
107          * allow as many work registers as needed */
108 
109         if (uniform_count <= 8)
110                 return 16;
111 
112         /* Otherwise, estimate the register pressure */
113 
114         unsigned pressure = mir_estimate_pressure(ctx);
115 
116         /* Prioritize not spilling above all else. The relation between the
117          * pressure estimate and the actual register pressure is a little
118          * murkier than we might like (due to scheduling, pipeline registers,
119          * failure to pack vector registers, load/store registers, texture
120          * registers...), hence why this is a heuristic parameter */
121 
122         if (pressure > 6)
123                 return 16;
124 
125         /* If there's no chance of spilling, prioritize UBOs and thread count */
126 
127         return 8;
128 }
129 
130 /* Bitset of indices that will be used as a special register -- inputs to a
131  * non-ALU op. We precompute this set so that testing is efficient, otherwise
132  * we end up O(mn) behaviour for n instructions and m uniform reads */
133 
134 static BITSET_WORD *
mir_special_indices(compiler_context * ctx)135 mir_special_indices(compiler_context *ctx)
136 {
137         mir_compute_temp_count(ctx);
138         BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));
139 
140         mir_foreach_instr_global(ctx, ins) {
141                 /* Look for special instructions */
142                 bool is_ldst = ins->type == TAG_LOAD_STORE_4;
143                 bool is_tex = ins->type == TAG_TEXTURE_4;
144                 bool is_writeout = ins->compact_branch && ins->writeout;
145 
146                 if (!(is_ldst || is_tex || is_writeout))
147                         continue;
148 
149                 /* Anything read by a special instruction is itself special */
150                 mir_foreach_src(ins, i) {
151                         unsigned idx = ins->src[i];
152 
153                         if (idx < ctx->temp_count)
154                                 BITSET_SET(bset, idx);
155                 }
156         }
157 
158         return bset;
159 }
160 
161 void
midgard_promote_uniforms(compiler_context * ctx)162 midgard_promote_uniforms(compiler_context *ctx)
163 {
164         unsigned work_count = mir_work_heuristic(ctx);
165         unsigned promoted_count = 24 - work_count;
166 
167         /* First, figure out special indices a priori so we don't recompute a lot */
168         BITSET_WORD *special = mir_special_indices(ctx);
169 
170         mir_foreach_instr_global_safe(ctx, ins) {
171                 if (!mir_is_promoteable_ubo(ins)) continue;
172 
173                 unsigned off = ins->constants.u32[0];
174                 unsigned address = off / 16;
175 
176                 /* Check if it's a promotable range */
177                 unsigned uniform_reg = 23 - address;
178 
179                 if (address >= promoted_count) continue;
180 
181                 /* It is, great! Let's promote */
182 
183                 ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
184                 unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);
185 
186                 /* We do need the move for safety for a non-SSA dest, or if
187                  * we're being fed into a special class */
188 
189                 bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;
190 
191                 if (ins->dest < ctx->temp_count)
192                         needs_move |= BITSET_TEST(special, ins->dest);
193 
194                 if (needs_move) {
195                         unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
196                         midgard_instruction mov = v_mov(promoted, ins->dest);
197                         mov.dest_type = nir_type_uint | type_size;
198                         mov.src_types[1] = mov.dest_type;
199 
200                         uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
201                         mir_set_bytemask(&mov, rounded);
202                         mir_insert_instruction_before(ctx, ins, mov);
203                 } else {
204                         mir_rewrite_index_src(ctx, ins->dest, promoted);
205                 }
206 
207                 mir_remove_instruction(ins);
208         }
209 
210         free(special);
211 }
212