1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28 
/* Designated initializer for a non-magic register reference with index i. */
#define QPU_R(i) { .magic = false, .index = (i) }

/* Register numbering used with the register allocator: the ACC_COUNT
 * accumulators come first, starting at ACC_INDEX, and the PHYS_COUNT
 * physical register file entries follow immediately after them.
 */
#define ACC_COUNT     5
#define PHYS_COUNT    64
#define ACC_INDEX     0
#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
35 
36 bool
vir_init_reg_sets(struct v3d_compiler * compiler)37 vir_init_reg_sets(struct v3d_compiler *compiler)
38 {
39         /* Allocate up to 3 regfile classes, for the ways the physical
40          * register file can be divided up for fragment shader threading.
41          */
42         int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
43 
44         compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
45                                           true);
46         if (!compiler->regs)
47                 return false;
48 
49         for (int threads = 0; threads < max_thread_index; threads++) {
50                 compiler->reg_class_phys_or_acc[threads] =
51                         ra_alloc_reg_class(compiler->regs);
52                 compiler->reg_class_phys[threads] =
53                         ra_alloc_reg_class(compiler->regs);
54 
55                 for (int i = PHYS_INDEX;
56                      i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
57                         ra_class_add_reg(compiler->regs,
58                                          compiler->reg_class_phys_or_acc[threads], i);
59                         ra_class_add_reg(compiler->regs,
60                                          compiler->reg_class_phys[threads], i);
61                 }
62 
63                 for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
64                         ra_class_add_reg(compiler->regs,
65                                          compiler->reg_class_phys_or_acc[threads], i);
66                 }
67         }
68 
69         ra_set_finalize(compiler->regs, NULL);
70 
71         return true;
72 }
73 
/**
 * Sort record pairing a temp with the key used to order register allocator
 * nodes.
 */
struct node_to_temp_map {
        /* Index of the temp this entry describes. */
        uint32_t temp;
        /* Sort key; v3d_register_allocate() fills it with
         * temp_end - temp_start for the temp.
         */
        uint32_t priority;
};

/**
 * qsort() comparator that orders node_to_temp_map entries by ascending
 * priority.
 *
 * Returns a negative value, zero, or a positive value when in_a sorts
 * before, equal to, or after in_b.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        /* Compare explicitly instead of subtracting: the priorities are
         * unsigned 32-bit values, so a difference converted to int has the
         * wrong sign whenever the two keys are 2^31 or more apart, which
         * would hand qsort() an inconsistent ordering.
         */
        return (a->priority > b->priority) - (a->priority < b->priority);
}
87 
/* Bits recording which register locations a temp may still be assigned to.
 * v3d_register_allocate() starts every temp with all four bits set and
 * clears bits as instructions rule locations out.
 */
#define CLASS_BIT_PHYS   (1 << 0)
#define CLASS_BIT_R0_R2  (1 << 1)
#define CLASS_BIT_R3     (1 << 2)
#define CLASS_BIT_R4     (1 << 3)
92 
93 /**
94  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
95  *
96  * The return value should be freed by the caller.
97  */
98 struct qpu_reg *
v3d_register_allocate(struct v3d_compile * c)99 v3d_register_allocate(struct v3d_compile *c)
100 {
101         struct node_to_temp_map map[c->num_temps];
102         uint32_t temp_to_node[c->num_temps];
103         uint8_t class_bits[c->num_temps];
104         struct qpu_reg *temp_registers = calloc(c->num_temps,
105                                                 sizeof(*temp_registers));
106         int acc_nodes[ACC_COUNT];
107 
108         struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
109                                                          c->num_temps +
110                                                          ARRAY_SIZE(acc_nodes));
111         /* Convert 1, 2, 4 threads to 0, 1, 2 index.
112          *
113          * V3D 4.x has double the physical register space, so 64 physical regs
114          * are available at both 1x and 2x threading, and 4x has 32.
115          */
116         int thread_index = ffs(c->threads) - 1;
117         if (c->devinfo->ver >= 40) {
118                 if (thread_index >= 1)
119                         thread_index--;
120         }
121 
122         /* Make some fixed nodes for the accumulators, which we will need to
123          * interfere with when ops have implied r3/r4 writes or for the thread
124          * switches.  We could represent these as classes for the nodes to
125          * live in, but the classes take up a lot of memory to set up, so we
126          * don't want to make too many.
127          */
128         for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
129                 acc_nodes[i] = c->num_temps + i;
130                 ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
131         }
132 
133         for (uint32_t i = 0; i < c->num_temps; i++) {
134                 map[i].temp = i;
135                 map[i].priority = c->temp_end[i] - c->temp_start[i];
136         }
137         qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
138         for (uint32_t i = 0; i < c->num_temps; i++) {
139                 temp_to_node[map[i].temp] = i;
140         }
141 
142         /* Figure out our register classes and preallocated registers.  We
143          * start with any temp being able to be in any file, then instructions
144          * incrementally remove bits that the temp definitely can't be in.
145          */
146         memset(class_bits,
147                CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
148                sizeof(class_bits));
149 
150         int ip = 0;
151         vir_for_each_inst_inorder(inst, c) {
152                 /* If the instruction writes r3/r4 (and optionally moves its
153                  * result to a temp), nothing else can be stored in r3/r4 across
154                  * it.
155                  */
156                 if (vir_writes_r3(c->devinfo, inst)) {
157                         for (int i = 0; i < c->num_temps; i++) {
158                                 if (c->temp_start[i] < ip &&
159                                     c->temp_end[i] > ip) {
160                                         ra_add_node_interference(g,
161                                                                  temp_to_node[i],
162                                                                  acc_nodes[3]);
163                                 }
164                         }
165                 }
166                 if (vir_writes_r4(c->devinfo, inst)) {
167                         for (int i = 0; i < c->num_temps; i++) {
168                                 if (c->temp_start[i] < ip &&
169                                     c->temp_end[i] > ip) {
170                                         ra_add_node_interference(g,
171                                                                  temp_to_node[i],
172                                                                  acc_nodes[4]);
173                                 }
174                         }
175                 }
176 
177                 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
178                         switch (inst->qpu.alu.add.op) {
179                         case V3D_QPU_A_LDVPMV_IN:
180                         case V3D_QPU_A_LDVPMV_OUT:
181                         case V3D_QPU_A_LDVPMD_IN:
182                         case V3D_QPU_A_LDVPMD_OUT:
183                         case V3D_QPU_A_LDVPMP:
184                         case V3D_QPU_A_LDVPMG_IN:
185                         case V3D_QPU_A_LDVPMG_OUT:
186                                 /* LDVPMs only store to temps (the MA flag
187                                  * decides whether the LDVPM is in or out)
188                                  */
189                                 assert(inst->dst.file == QFILE_TEMP);
190                                 class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
191                                 break;
192 
193                         default:
194                                 break;
195                         }
196                 }
197 
198                 if (inst->src[0].file == QFILE_REG) {
199                         switch (inst->src[0].index) {
200                         case 0:
201                         case 1:
202                         case 2:
203                                 /* Payload setup instructions: Force allocate
204                                  * the dst to the given register (so the MOV
205                                  * will disappear).
206                                  */
207                                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
208                                 assert(inst->dst.file == QFILE_TEMP);
209                                 ra_set_node_reg(g,
210                                                 temp_to_node[inst->dst.index],
211                                                 PHYS_INDEX +
212                                                 inst->src[0].index);
213                                 break;
214                         }
215                 }
216 
217                 if (inst->qpu.sig.thrsw) {
218                         /* All accumulators are invalidated across a thread
219                          * switch.
220                          */
221                         for (int i = 0; i < c->num_temps; i++) {
222                                 if (c->temp_start[i] < ip && c->temp_end[i] > ip)
223                                         class_bits[i] &= CLASS_BIT_PHYS;
224                         }
225                 }
226 
227                 ip++;
228         }
229 
230         for (uint32_t i = 0; i < c->num_temps; i++) {
231                 if (class_bits[i] == CLASS_BIT_PHYS) {
232                         ra_set_node_class(g, temp_to_node[i],
233                                           c->compiler->reg_class_phys[thread_index]);
234                 } else {
235                         assert(class_bits[i] == (CLASS_BIT_PHYS |
236                                                  CLASS_BIT_R0_R2 |
237                                                  CLASS_BIT_R3 |
238                                                  CLASS_BIT_R4));
239                         ra_set_node_class(g, temp_to_node[i],
240                                           c->compiler->reg_class_phys_or_acc[thread_index]);
241                 }
242         }
243 
244         for (uint32_t i = 0; i < c->num_temps; i++) {
245                 for (uint32_t j = i + 1; j < c->num_temps; j++) {
246                         if (!(c->temp_start[i] >= c->temp_end[j] ||
247                               c->temp_start[j] >= c->temp_end[i])) {
248                                 ra_add_node_interference(g,
249                                                          temp_to_node[i],
250                                                          temp_to_node[j]);
251                         }
252                 }
253         }
254 
255         bool ok = ra_allocate(g);
256         if (!ok) {
257                 free(temp_registers);
258                 return NULL;
259         }
260 
261         for (uint32_t i = 0; i < c->num_temps; i++) {
262                 int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
263                 if (ra_reg < PHYS_INDEX) {
264                         temp_registers[i].magic = true;
265                         temp_registers[i].index = (V3D_QPU_WADDR_R0 +
266                                                    ra_reg - ACC_INDEX);
267                 } else {
268                         temp_registers[i].magic = false;
269                         temp_registers[i].index = ra_reg - PHYS_INDEX;
270                 }
271 
272                 /* If the value's never used, just write to the NOP register
273                  * for clarity in debug output.
274                  */
275                 if (c->temp_start[i] == c->temp_end[i]) {
276                         temp_registers[i].magic = true;
277                         temp_registers[i].index = V3D_QPU_WADDR_NOP;
278                 }
279         }
280 
281         ralloc_free(g);
282 
283         return temp_registers;
284 }
285