1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 extern "C" {
25 #include "main/macros.h"
26 #include "program/register_allocate.h"
27 } /* extern "C" */
28 
29 #include "brw_vec4.h"
30 #include "glsl/ir_print_visitor.h"
31 
32 using namespace brw;
33 
34 namespace brw {
35 
36 static void
assign(unsigned int * reg_hw_locations,reg * reg)37 assign(unsigned int *reg_hw_locations, reg *reg)
38 {
39    if (reg->file == GRF) {
40       reg->reg = reg_hw_locations[reg->reg];
41    }
42 }
43 
44 bool
reg_allocate_trivial()45 vec4_visitor::reg_allocate_trivial()
46 {
47    unsigned int hw_reg_mapping[this->virtual_grf_count];
48    bool virtual_grf_used[this->virtual_grf_count];
49    int i;
50    int next;
51 
52    /* Calculate which virtual GRFs are actually in use after whatever
53     * optimization passes have occurred.
54     */
55    for (int i = 0; i < this->virtual_grf_count; i++) {
56       virtual_grf_used[i] = false;
57    }
58 
59    foreach_iter(exec_list_iterator, iter, this->instructions) {
60       vec4_instruction *inst = (vec4_instruction *)iter.get();
61 
62       if (inst->dst.file == GRF)
63 	 virtual_grf_used[inst->dst.reg] = true;
64 
65       for (int i = 0; i < 3; i++) {
66 	 if (inst->src[i].file == GRF)
67 	    virtual_grf_used[inst->src[i].reg] = true;
68       }
69    }
70 
71    hw_reg_mapping[0] = this->first_non_payload_grf;
72    next = hw_reg_mapping[0] + this->virtual_grf_sizes[0];
73    for (i = 1; i < this->virtual_grf_count; i++) {
74       if (virtual_grf_used[i]) {
75 	 hw_reg_mapping[i] = next;
76 	 next += this->virtual_grf_sizes[i];
77       }
78    }
79    prog_data->total_grf = next;
80 
81    foreach_iter(exec_list_iterator, iter, this->instructions) {
82       vec4_instruction *inst = (vec4_instruction *)iter.get();
83 
84       assign(hw_reg_mapping, &inst->dst);
85       assign(hw_reg_mapping, &inst->src[0]);
86       assign(hw_reg_mapping, &inst->src[1]);
87       assign(hw_reg_mapping, &inst->src[2]);
88    }
89 
90    if (prog_data->total_grf > max_grf) {
91       fail("Ran out of regs on trivial allocator (%d/%d)\n",
92 	   prog_data->total_grf, max_grf);
93       return false;
94    }
95 
96    return true;
97 }
98 
99 static void
brw_alloc_reg_set_for_classes(struct brw_context * brw,int * class_sizes,int class_count,int base_reg_count)100 brw_alloc_reg_set_for_classes(struct brw_context *brw,
101 			      int *class_sizes,
102 			      int class_count,
103 			      int base_reg_count)
104 {
105    /* Compute the total number of registers across all classes. */
106    int ra_reg_count = 0;
107    for (int i = 0; i < class_count; i++) {
108       ra_reg_count += base_reg_count - (class_sizes[i] - 1);
109    }
110 
111    ralloc_free(brw->vs.ra_reg_to_grf);
112    brw->vs.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
113    ralloc_free(brw->vs.regs);
114    brw->vs.regs = ra_alloc_reg_set(brw, ra_reg_count);
115    ralloc_free(brw->vs.classes);
116    brw->vs.classes = ralloc_array(brw, int, class_count + 1);
117 
118    /* Now, add the registers to their classes, and add the conflicts
119     * between them and the base GRF registers (and also each other).
120     */
121    int reg = 0;
122    for (int i = 0; i < class_count; i++) {
123       int class_reg_count = base_reg_count - (class_sizes[i] - 1);
124       brw->vs.classes[i] = ra_alloc_reg_class(brw->vs.regs);
125 
126       for (int j = 0; j < class_reg_count; j++) {
127 	 ra_class_add_reg(brw->vs.regs, brw->vs.classes[i], reg);
128 
129 	 brw->vs.ra_reg_to_grf[reg] = j;
130 
131 	 for (int base_reg = j;
132 	      base_reg < j + class_sizes[i];
133 	      base_reg++) {
134 	    ra_add_transitive_reg_conflict(brw->vs.regs, base_reg, reg);
135 	 }
136 
137 	 reg++;
138       }
139    }
140    assert(reg == ra_reg_count);
141 
142    ra_set_finalize(brw->vs.regs);
143 }
144 
145 bool
reg_allocate()146 vec4_visitor::reg_allocate()
147 {
148    unsigned int hw_reg_mapping[virtual_grf_count];
149    int first_assigned_grf = this->first_non_payload_grf;
150    int base_reg_count = max_grf - first_assigned_grf;
151    int class_sizes[base_reg_count];
152    int class_count = 0;
153 
154    /* Using the trivial allocator can be useful in debugging undefined
155     * register access as a result of broken optimization passes.
156     */
157    if (0)
158       return reg_allocate_trivial();
159 
160    calculate_live_intervals();
161 
162    /* Set up the register classes.
163     *
164     * The base registers store a vec4.  However, we'll need larger
165     * storage for arrays, structures, and matrices, which will be sets
166     * of contiguous registers.
167     */
168    class_sizes[class_count++] = 1;
169 
170    for (int r = 0; r < virtual_grf_count; r++) {
171       int i;
172 
173       for (i = 0; i < class_count; i++) {
174 	 if (class_sizes[i] == this->virtual_grf_sizes[r])
175 	    break;
176       }
177       if (i == class_count) {
178 	 if (this->virtual_grf_sizes[r] >= base_reg_count) {
179 	    fail("Object too large to register allocate.\n");
180 	 }
181 
182 	 class_sizes[class_count++] = this->virtual_grf_sizes[r];
183       }
184    }
185 
186    brw_alloc_reg_set_for_classes(brw, class_sizes, class_count, base_reg_count);
187 
188    struct ra_graph *g = ra_alloc_interference_graph(brw->vs.regs,
189 						    virtual_grf_count);
190 
191    for (int i = 0; i < virtual_grf_count; i++) {
192       for (int c = 0; c < class_count; c++) {
193 	 if (class_sizes[c] == this->virtual_grf_sizes[i]) {
194 	    ra_set_node_class(g, i, brw->vs.classes[c]);
195 	    break;
196 	 }
197       }
198 
199       for (int j = 0; j < i; j++) {
200 	 if (virtual_grf_interferes(i, j)) {
201 	    ra_add_node_interference(g, i, j);
202 	 }
203       }
204    }
205 
206    if (!ra_allocate_no_spills(g)) {
207       /* Failed to allocate registers.  Spill a reg, and the caller will
208        * loop back into here to try again.
209        */
210       int reg = choose_spill_reg(g);
211       if (reg == -1) {
212          fail("no register to spill\n");
213       } else {
214          spill_reg(reg);
215       }
216       ralloc_free(g);
217       return false;
218    }
219 
220    /* Get the chosen virtual registers for each node, and map virtual
221     * regs in the register classes back down to real hardware reg
222     * numbers.
223     */
224    prog_data->total_grf = first_assigned_grf;
225    for (int i = 0; i < virtual_grf_count; i++) {
226       int reg = ra_get_node_reg(g, i);
227 
228       hw_reg_mapping[i] = first_assigned_grf + brw->vs.ra_reg_to_grf[reg];
229       prog_data->total_grf = MAX2(prog_data->total_grf,
230 				  hw_reg_mapping[i] + virtual_grf_sizes[i]);
231    }
232 
233    foreach_list(node, &this->instructions) {
234       vec4_instruction *inst = (vec4_instruction *)node;
235 
236       assign(hw_reg_mapping, &inst->dst);
237       assign(hw_reg_mapping, &inst->src[0]);
238       assign(hw_reg_mapping, &inst->src[1]);
239       assign(hw_reg_mapping, &inst->src[2]);
240    }
241 
242    ralloc_free(g);
243 
244    return true;
245 }
246 
247 void
evaluate_spill_costs(float * spill_costs,bool * no_spill)248 vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
249 {
250    float loop_scale = 1.0;
251 
252    for (int i = 0; i < this->virtual_grf_count; i++) {
253       spill_costs[i] = 0.0;
254       no_spill[i] = virtual_grf_sizes[i] != 1;
255    }
256 
257    /* Calculate costs for spilling nodes.  Call it a cost of 1 per
258     * spill/unspill we'll have to do, and guess that the insides of
259     * loops run 10 times.
260     */
261    foreach_list(node, &this->instructions) {
262       vec4_instruction *inst = (vec4_instruction *) node;
263 
264       for (unsigned int i = 0; i < 3; i++) {
265 	 if (inst->src[i].file == GRF) {
266 	    spill_costs[inst->src[i].reg] += loop_scale;
267             if (inst->src[i].reladdr)
268                no_spill[inst->src[i].reg] = true;
269 	 }
270       }
271 
272       if (inst->dst.file == GRF) {
273 	 spill_costs[inst->dst.reg] += loop_scale;
274          if (inst->dst.reladdr)
275             no_spill[inst->dst.reg] = true;
276       }
277 
278       switch (inst->opcode) {
279 
280       case BRW_OPCODE_DO:
281 	 loop_scale *= 10;
282 	 break;
283 
284       case BRW_OPCODE_WHILE:
285 	 loop_scale /= 10;
286 	 break;
287 
288       case VS_OPCODE_SCRATCH_READ:
289       case VS_OPCODE_SCRATCH_WRITE:
290          for (int i = 0; i < 3; i++) {
291             if (inst->src[i].file == GRF)
292                no_spill[inst->src[i].reg] = true;
293          }
294 	 if (inst->dst.file == GRF)
295 	    no_spill[inst->dst.reg] = true;
296 	 break;
297 
298       default:
299 	 break;
300       }
301    }
302 }
303 
304 int
choose_spill_reg(struct ra_graph * g)305 vec4_visitor::choose_spill_reg(struct ra_graph *g)
306 {
307    float spill_costs[this->virtual_grf_count];
308    bool no_spill[this->virtual_grf_count];
309 
310    evaluate_spill_costs(spill_costs, no_spill);
311 
312    for (int i = 0; i < this->virtual_grf_count; i++) {
313       if (!no_spill[i])
314          ra_set_node_spill_cost(g, i, spill_costs[i]);
315    }
316 
317    return ra_get_best_spill_node(g);
318 }
319 
320 void
spill_reg(int spill_reg_nr)321 vec4_visitor::spill_reg(int spill_reg_nr)
322 {
323    assert(virtual_grf_sizes[spill_reg_nr] == 1);
324    unsigned int spill_offset = c->last_scratch++;
325 
326    /* Generate spill/unspill instructions for the objects being spilled. */
327    foreach_list(node, &this->instructions) {
328       vec4_instruction *inst = (vec4_instruction *) node;
329 
330       for (unsigned int i = 0; i < 3; i++) {
331          if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) {
332             src_reg spill_reg = inst->src[i];
333             inst->src[i].reg = virtual_grf_alloc(1);
334             dst_reg temp = dst_reg(inst->src[i]);
335 
336             /* Only read the necessary channels, to avoid overwriting the rest
337              * with data that may not have been written to scratch.
338              */
339             temp.writemask = 0;
340             for (int c = 0; c < 4; c++)
341                temp.writemask |= (1 << BRW_GET_SWZ(inst->src[i].swizzle, c));
342             assert(temp.writemask != 0);
343 
344             emit_scratch_read(inst, temp, spill_reg, spill_offset);
345          }
346       }
347 
348       if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) {
349          dst_reg spill_reg = inst->dst;
350          inst->dst.reg = virtual_grf_alloc(1);
351 
352          /* We don't want a swizzle when reading from the source; read the
353           * whole register and use spill_reg's writemask to select which
354           * channels to write.
355           */
356          src_reg temp = src_reg(inst->dst);
357          temp.swizzle = BRW_SWIZZLE_XYZW;
358          emit_scratch_write(inst, temp, spill_reg, spill_offset);
359       }
360    }
361 
362    this->live_intervals_valid = false;
363 }
364 
365 } /* namespace brw */
366