1 /*
2  * Copyright © 2010 Intel Corporation
3  * Copyright © 2014-2017 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 /**
26  * @file
27  *
28  * The basic model of the list scheduler is to take a basic block, compute a
29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30  * pick a DAG head, then put all the children that are now DAG heads into the
31  * list of things to schedule.
32  *
33  * The goal of scheduling here is to pack pairs of operations together in a
34  * single QPU instruction.
35  */
36 
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40 
/* File-local debug switch.  NOTE(review): nothing in the visible part of this
 * file reads or writes it -- confirm where it is toggled.
 */
static bool debug;

/* Forward declaration so that struct schedule_node can hold a pointer to its
 * array of children before the child type is defined.
 */
struct schedule_node_child;
44 
45 struct schedule_node {
46         struct list_head link;
47         struct qinst *inst;
48         struct schedule_node_child *children;
49         uint32_t child_count;
50         uint32_t child_array_size;
51         uint32_t parent_count;
52 
53         /* Longest cycles + instruction_latency() of any parent of this node. */
54         uint32_t unblocked_time;
55 
56         /**
57          * Minimum number of cycles from scheduling this instruction until the
58          * end of the program, based on the slowest dependency chain through
59          * the children.
60          */
61         uint32_t delay;
62 
63         /**
64          * cycles between this instruction being scheduled and when its result
65          * can be consumed.
66          */
67         uint32_t latency;
68 };
69 
/* An outgoing DAG edge: the dependent node plus the kind of dependency. */
struct schedule_node_child {
        /* The node that has to come after the edge's owner. */
        struct schedule_node *node;

        /* Set by add_dep() for read (non-write) dependencies that were
         * discovered while walking the instructions in reverse.
         */
        bool write_after_read;
};
74 
/* Direction of the dependency walk.  When the instructions are visited in
 * reverse, add_dep() has to swap its before/after arguments.
 */
enum direction {
        F, /* forward walk, see calculate_forward_deps() */
        R, /* reverse walk, see calculate_reverse_deps() */
};
79 
80 struct schedule_state {
81         const struct v3d_device_info *devinfo;
82         struct schedule_node *last_r[6];
83         struct schedule_node *last_rf[64];
84         struct schedule_node *last_sf;
85         struct schedule_node *last_vpm_read;
86         struct schedule_node *last_tmu_write;
87         struct schedule_node *last_tmu_config;
88         struct schedule_node *last_tlb;
89         struct schedule_node *last_vpm;
90         struct schedule_node *last_unif;
91         struct schedule_node *last_rtop;
92         enum direction dir;
93         /* Estimated cycle when the current instruction would start. */
94         uint32_t time;
95 };
96 
97 static void
add_dep(struct schedule_state * state,struct schedule_node * before,struct schedule_node * after,bool write)98 add_dep(struct schedule_state *state,
99         struct schedule_node *before,
100         struct schedule_node *after,
101         bool write)
102 {
103         bool write_after_read = !write && state->dir == R;
104 
105         if (!before || !after)
106                 return;
107 
108         assert(before != after);
109 
110         if (state->dir == R) {
111                 struct schedule_node *t = before;
112                 before = after;
113                 after = t;
114         }
115 
116         for (int i = 0; i < before->child_count; i++) {
117                 if (before->children[i].node == after &&
118                     (before->children[i].write_after_read == write_after_read)) {
119                         return;
120                 }
121         }
122 
123         if (before->child_array_size <= before->child_count) {
124                 before->child_array_size = MAX2(before->child_array_size * 2, 16);
125                 before->children = reralloc(before, before->children,
126                                             struct schedule_node_child,
127                                             before->child_array_size);
128         }
129 
130         before->children[before->child_count].node = after;
131         before->children[before->child_count].write_after_read =
132                 write_after_read;
133         before->child_count++;
134         after->parent_count++;
135 }
136 
/* Adds a read (non-write) dependency between "before" and "after".  The
 * tracking state is left untouched.
 */
static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        const bool is_write = false;

        add_dep(state, before, after, is_write);
}
144 
/* Adds a write dependency between the node currently stored in *before and
 * "after", and makes "after" the new occupant of that tracking slot.
 */
static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        struct schedule_node *previous = *before;

        /* add_dep() only sees the node pointers, so the slot can be updated
         * before the edge is recorded.
         */
        *before = after;
        add_dep(state, previous, after, true);
}
153 
154 static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr * inst)155 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
156 {
157         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
158                 return false;
159 
160         if (inst->alu.add.magic_write &&
161             (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
162              inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
163                 return true;
164 
165         if (inst->alu.mul.magic_write &&
166             (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
167              inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
168                 return true;
169 
170         return false;
171 }
172 
173 static void
process_mux_deps(struct schedule_state * state,struct schedule_node * n,enum v3d_qpu_mux mux)174 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
175                  enum v3d_qpu_mux mux)
176 {
177         switch (mux) {
178         case V3D_QPU_MUX_A:
179                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
180                 break;
181         case V3D_QPU_MUX_B:
182                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
183                 break;
184         default:
185                 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
186                 break;
187         }
188 }
189 
190 
191 static void
process_waddr_deps(struct schedule_state * state,struct schedule_node * n,uint32_t waddr,bool magic)192 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
193                    uint32_t waddr, bool magic)
194 {
195         if (!magic) {
196                 add_write_dep(state, &state->last_rf[waddr], n);
197         } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
198                 add_write_dep(state, &state->last_tmu_write, n);
199                 switch (waddr) {
200                 case V3D_QPU_WADDR_TMUS:
201                 case V3D_QPU_WADDR_TMUSCM:
202                 case V3D_QPU_WADDR_TMUSF:
203                 case V3D_QPU_WADDR_TMUSLOD:
204                         add_write_dep(state, &state->last_tmu_config, n);
205                         break;
206                 default:
207                         break;
208                 }
209         } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
210                 /* Handled by v3d_qpu_writes_r4() check. */
211         } else {
212                 switch (waddr) {
213                 case V3D_QPU_WADDR_R0:
214                 case V3D_QPU_WADDR_R1:
215                 case V3D_QPU_WADDR_R2:
216                         add_write_dep(state,
217                                       &state->last_r[waddr - V3D_QPU_WADDR_R0],
218                                       n);
219                         break;
220                 case V3D_QPU_WADDR_R3:
221                 case V3D_QPU_WADDR_R4:
222                 case V3D_QPU_WADDR_R5:
223                         /* Handled by v3d_qpu_writes_r*() checks below. */
224                         break;
225 
226                 case V3D_QPU_WADDR_VPM:
227                 case V3D_QPU_WADDR_VPMU:
228                         add_write_dep(state, &state->last_vpm, n);
229                         break;
230 
231                 case V3D_QPU_WADDR_TLB:
232                 case V3D_QPU_WADDR_TLBU:
233                         add_write_dep(state, &state->last_tlb, n);
234                         break;
235 
236                 case V3D_QPU_WADDR_NOP:
237                         break;
238 
239                 default:
240                         fprintf(stderr, "Unknown waddr %d\n", waddr);
241                         abort();
242                 }
243         }
244 }
245 
246 static void
process_cond_deps(struct schedule_state * state,struct schedule_node * n,enum v3d_qpu_cond cond)247 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
248                   enum v3d_qpu_cond cond)
249 {
250         if (cond != V3D_QPU_COND_NONE)
251                 add_read_dep(state, state->last_sf, n);
252 }
253 
254 static void
process_pf_deps(struct schedule_state * state,struct schedule_node * n,enum v3d_qpu_pf pf)255 process_pf_deps(struct schedule_state *state, struct schedule_node *n,
256                 enum v3d_qpu_pf pf)
257 {
258         if (pf != V3D_QPU_PF_NONE)
259                 add_write_dep(state, &state->last_sf, n);
260 }
261 
262 static void
process_uf_deps(struct schedule_state * state,struct schedule_node * n,enum v3d_qpu_uf uf)263 process_uf_deps(struct schedule_state *state, struct schedule_node *n,
264                 enum v3d_qpu_uf uf)
265 {
266         if (uf != V3D_QPU_UF_NONE)
267                 add_write_dep(state, &state->last_sf, n);
268 }
269 
270 /**
271  * Common code for dependencies that need to be tracked both forward and
272  * backward.
273  *
274  * This is for things like "all reads of r4 have to happen between the r4
275  * writes that surround them".
276  */
277 static void
calculate_deps(struct schedule_state * state,struct schedule_node * n)278 calculate_deps(struct schedule_state *state, struct schedule_node *n)
279 {
280         const struct v3d_device_info *devinfo = state->devinfo;
281         struct qinst *qinst = n->inst;
282         struct v3d_qpu_instr *inst = &qinst->qpu;
283 
284         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
285                 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
286                         add_read_dep(state, state->last_sf, n);
287 
288                 /* XXX: BDI */
289                 /* XXX: BDU */
290                 /* XXX: ub */
291                 /* XXX: raddr_a */
292 
293                 add_write_dep(state, &state->last_unif, n);
294                 return;
295         }
296 
297         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
298 
299         /* XXX: LOAD_IMM */
300 
301         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
302                 process_mux_deps(state, n, inst->alu.add.a);
303         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
304                 process_mux_deps(state, n, inst->alu.add.b);
305 
306         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
307                 process_mux_deps(state, n, inst->alu.mul.a);
308         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
309                 process_mux_deps(state, n, inst->alu.mul.b);
310 
311         switch (inst->alu.add.op) {
312         case V3D_QPU_A_VPMSETUP:
313                 /* Could distinguish read/write by unpacking the uniform. */
314                 add_write_dep(state, &state->last_vpm, n);
315                 add_write_dep(state, &state->last_vpm_read, n);
316                 break;
317 
318         case V3D_QPU_A_STVPMV:
319         case V3D_QPU_A_STVPMD:
320         case V3D_QPU_A_STVPMP:
321                 add_write_dep(state, &state->last_vpm, n);
322                 break;
323 
324         case V3D_QPU_A_VPMWT:
325                 add_read_dep(state, state->last_vpm, n);
326                 break;
327 
328         case V3D_QPU_A_MSF:
329                 add_read_dep(state, state->last_tlb, n);
330                 break;
331 
332         case V3D_QPU_A_SETMSF:
333         case V3D_QPU_A_SETREVF:
334                 add_write_dep(state, &state->last_tlb, n);
335                 break;
336 
337         case V3D_QPU_A_FLAPUSH:
338         case V3D_QPU_A_FLBPUSH:
339         case V3D_QPU_A_VFLA:
340         case V3D_QPU_A_VFLNA:
341         case V3D_QPU_A_VFLB:
342         case V3D_QPU_A_VFLNB:
343                 add_read_dep(state, state->last_sf, n);
344                 break;
345 
346         case V3D_QPU_A_FLBPOP:
347                 add_write_dep(state, &state->last_sf, n);
348                 break;
349 
350         default:
351                 break;
352         }
353 
354         switch (inst->alu.mul.op) {
355         case V3D_QPU_M_MULTOP:
356         case V3D_QPU_M_UMUL24:
357                 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
358                  * resets it to 0.  We could possibly reorder umul24s relative
359                  * to each other, but for now just keep all the MUL parts in
360                  * order.
361                  */
362                 add_write_dep(state, &state->last_rtop, n);
363                 break;
364         default:
365                 break;
366         }
367 
368         if (inst->alu.add.op != V3D_QPU_A_NOP) {
369                 process_waddr_deps(state, n, inst->alu.add.waddr,
370                                    inst->alu.add.magic_write);
371         }
372         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
373                 process_waddr_deps(state, n, inst->alu.mul.waddr,
374                                    inst->alu.mul.magic_write);
375         }
376         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
377                 process_waddr_deps(state, n, inst->sig_addr,
378                                    inst->sig_magic);
379         }
380 
381         if (v3d_qpu_writes_r3(devinfo, inst))
382                 add_write_dep(state, &state->last_r[3], n);
383         if (v3d_qpu_writes_r4(devinfo, inst))
384                 add_write_dep(state, &state->last_r[4], n);
385         if (v3d_qpu_writes_r5(devinfo, inst))
386                 add_write_dep(state, &state->last_r[5], n);
387 
388         if (inst->sig.thrsw) {
389                 /* All accumulator contents and flags are undefined after the
390                  * switch.
391                  */
392                 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
393                         add_write_dep(state, &state->last_r[i], n);
394                 add_write_dep(state, &state->last_sf, n);
395 
396                 /* Scoreboard-locking operations have to stay after the last
397                  * thread switch.
398                  */
399                 add_write_dep(state, &state->last_tlb, n);
400 
401                 add_write_dep(state, &state->last_tmu_write, n);
402                 add_write_dep(state, &state->last_tmu_config, n);
403         }
404 
405         if (inst->sig.ldtmu) {
406                 /* TMU loads are coming from a FIFO, so ordering is important.
407                  */
408                 add_write_dep(state, &state->last_tmu_write, n);
409         }
410 
411         if (inst->sig.wrtmuc)
412                 add_write_dep(state, &state->last_tmu_config, n);
413 
414         if (inst->sig.ldtlb | inst->sig.ldtlbu)
415                 add_read_dep(state, state->last_tlb, n);
416 
417         if (inst->sig.ldvpm)
418                 add_write_dep(state, &state->last_vpm_read, n);
419 
420         /* inst->sig.ldunif or sideband uniform read */
421         if (qinst->uniform != ~0)
422                 add_write_dep(state, &state->last_unif, n);
423 
424         process_cond_deps(state, n, inst->flags.ac);
425         process_cond_deps(state, n, inst->flags.mc);
426         process_pf_deps(state, n, inst->flags.apf);
427         process_pf_deps(state, n, inst->flags.mpf);
428         process_uf_deps(state, n, inst->flags.auf);
429         process_uf_deps(state, n, inst->flags.muf);
430 }
431 
432 static void
calculate_forward_deps(struct v3d_compile * c,struct list_head * schedule_list)433 calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
434 {
435         struct schedule_state state;
436 
437         memset(&state, 0, sizeof(state));
438         state.devinfo = c->devinfo;
439         state.dir = F;
440 
441         list_for_each_entry(struct schedule_node, node, schedule_list, link)
442                 calculate_deps(&state, node);
443 }
444 
445 static void
calculate_reverse_deps(struct v3d_compile * c,struct list_head * schedule_list)446 calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
447 {
448         struct list_head *node;
449         struct schedule_state state;
450 
451         memset(&state, 0, sizeof(state));
452         state.devinfo = c->devinfo;
453         state.dir = R;
454 
455         for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
456                 calculate_deps(&state, (struct schedule_node *)node);
457         }
458 }
459 
/* State consulted while picking the next instruction to emit. */
struct choose_scoreboard {
        /* Current tick; the last_*_tick fields are compared against it. */
        int tick;

        /* Tick of the last SFU write (gates r4 reads and r4 writes). */
        int last_sfu_write_tick;

        /* Tick of the last ldvary (gates r5 reads and ldunif/ldunifa). */
        int last_ldvary_tick;

        /* NOTE(review): not referenced in the visible part of the file. */
        int last_uniforms_reset_tick;

        /* Write addresses compared against raddr_a/raddr_b of candidates. */
        uint32_t last_waddr_add;
        uint32_t last_waddr_mul;

        /* While false, TLB instructions are not merged into a pair. */
        bool tlb_locked;
};
468 
469 static bool
mux_reads_too_soon(struct choose_scoreboard * scoreboard,const struct v3d_qpu_instr * inst,enum v3d_qpu_mux mux)470 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
471                    const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
472 {
473         switch (mux) {
474         case V3D_QPU_MUX_A:
475                 if (scoreboard->last_waddr_add == inst->raddr_a ||
476                     scoreboard->last_waddr_mul == inst->raddr_a) {
477                         return true;
478                 }
479                 break;
480 
481         case V3D_QPU_MUX_B:
482                 if (scoreboard->last_waddr_add == inst->raddr_b ||
483                     scoreboard->last_waddr_mul == inst->raddr_b) {
484                         return true;
485                 }
486                 break;
487 
488         case V3D_QPU_MUX_R4:
489                 if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
490                         return true;
491                 break;
492 
493         case V3D_QPU_MUX_R5:
494                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
495                         return true;
496                 break;
497         default:
498                 break;
499         }
500 
501         return false;
502 }
503 
504 static bool
reads_too_soon_after_write(struct choose_scoreboard * scoreboard,struct qinst * qinst)505 reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
506                            struct qinst *qinst)
507 {
508         const struct v3d_qpu_instr *inst = &qinst->qpu;
509 
510         /* XXX: Branching off of raddr. */
511         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
512                 return false;
513 
514         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
515 
516         if (inst->alu.add.op != V3D_QPU_A_NOP) {
517                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
518                     mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
519                         return true;
520                 }
521                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
522                     mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
523                         return true;
524                 }
525         }
526 
527         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
528                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
529                     mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
530                         return true;
531                 }
532                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
533                     mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
534                         return true;
535                 }
536         }
537 
538         /* XXX: imm */
539 
540         return false;
541 }
542 
543 static bool
writes_too_soon_after_write(const struct v3d_device_info * devinfo,struct choose_scoreboard * scoreboard,struct qinst * qinst)544 writes_too_soon_after_write(const struct v3d_device_info *devinfo,
545                             struct choose_scoreboard *scoreboard,
546                             struct qinst *qinst)
547 {
548         const struct v3d_qpu_instr *inst = &qinst->qpu;
549 
550         /* Don't schedule any other r4 write too soon after an SFU write.
551          * This would normally be prevented by dependency tracking, but might
552          * occur if a dead SFU computation makes it to scheduling.
553          */
554         if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
555             v3d_qpu_writes_r4(devinfo, inst))
556                 return true;
557 
558         return false;
559 }
560 
561 static bool
pixel_scoreboard_too_soon(struct choose_scoreboard * scoreboard,const struct v3d_qpu_instr * inst)562 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
563                           const struct v3d_qpu_instr *inst)
564 {
565         return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
566 }
567 
568 static int
get_instruction_priority(const struct v3d_qpu_instr * inst)569 get_instruction_priority(const struct v3d_qpu_instr *inst)
570 {
571         uint32_t baseline_score;
572         uint32_t next_score = 0;
573 
574         /* Schedule TLB operations as late as possible, to get more
575          * parallelism between shaders.
576          */
577         if (qpu_inst_is_tlb(inst))
578                 return next_score;
579         next_score++;
580 
581         /* Schedule texture read results collection late to hide latency. */
582         if (inst->sig.ldtmu)
583                 return next_score;
584         next_score++;
585 
586         /* Default score for things that aren't otherwise special. */
587         baseline_score = next_score;
588         next_score++;
589 
590         /* Schedule texture read setup early to hide their latency better. */
591         if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
592             ((inst->alu.add.magic_write &&
593               v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
594              (inst->alu.mul.magic_write &&
595               v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
596                 return next_score;
597         }
598         next_score++;
599 
600         return baseline_score;
601 }
602 
603 static bool
qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)604 qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
605 {
606         return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
607                 v3d_qpu_magic_waddr_is_sfu(waddr) ||
608                 v3d_qpu_magic_waddr_is_tlb(waddr) ||
609                 v3d_qpu_magic_waddr_is_vpm(waddr) ||
610                 v3d_qpu_magic_waddr_is_tsy(waddr));
611 }
612 
613 static bool
qpu_accesses_peripheral(const struct v3d_qpu_instr * inst)614 qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
615 {
616         if (v3d_qpu_uses_vpm(inst))
617                 return true;
618 
619         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
620                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
621                     inst->alu.add.magic_write &&
622                     qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
623                         return true;
624                 }
625 
626                 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
627                     inst->alu.mul.magic_write &&
628                     qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
629                         return true;
630                 }
631         }
632 
633         return (inst->sig.ldvpm ||
634                 inst->sig.ldtmu ||
635                 inst->sig.ldtlb ||
636                 inst->sig.ldtlbu ||
637                 inst->sig.wrtmuc);
638 }
639 
640 static bool
qpu_merge_inst(const struct v3d_device_info * devinfo,struct v3d_qpu_instr * result,const struct v3d_qpu_instr * a,const struct v3d_qpu_instr * b)641 qpu_merge_inst(const struct v3d_device_info *devinfo,
642                struct v3d_qpu_instr *result,
643                const struct v3d_qpu_instr *a,
644                const struct v3d_qpu_instr *b)
645 {
646         if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
647             b->type != V3D_QPU_INSTR_TYPE_ALU) {
648                 return false;
649         }
650 
651         /* Can't do more than one peripheral access in an instruction.
652          *
653          * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
654          * WRTMUC with a TMU magic register write (other than tmuc).
655          */
656         if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
657                 return false;
658 
659         struct v3d_qpu_instr merge = *a;
660 
661         if (b->alu.add.op != V3D_QPU_A_NOP) {
662                 if (a->alu.add.op != V3D_QPU_A_NOP)
663                         return false;
664                 merge.alu.add = b->alu.add;
665 
666                 merge.flags.ac = b->flags.ac;
667                 merge.flags.apf = b->flags.apf;
668                 merge.flags.auf = b->flags.auf;
669         }
670 
671         if (b->alu.mul.op != V3D_QPU_M_NOP) {
672                 if (a->alu.mul.op != V3D_QPU_M_NOP)
673                         return false;
674                 merge.alu.mul = b->alu.mul;
675 
676                 merge.flags.mc = b->flags.mc;
677                 merge.flags.mpf = b->flags.mpf;
678                 merge.flags.muf = b->flags.muf;
679         }
680 
681         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
682                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
683                     a->raddr_a != b->raddr_a) {
684                         return false;
685                 }
686                 merge.raddr_a = b->raddr_a;
687         }
688 
689         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
690                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
691                     a->raddr_b != b->raddr_b) {
692                         return false;
693                 }
694                 merge.raddr_b = b->raddr_b;
695         }
696 
697         merge.sig.thrsw |= b->sig.thrsw;
698         merge.sig.ldunif |= b->sig.ldunif;
699         merge.sig.ldunifrf |= b->sig.ldunifrf;
700         merge.sig.ldunifa |= b->sig.ldunifa;
701         merge.sig.ldunifarf |= b->sig.ldunifarf;
702         merge.sig.ldtmu |= b->sig.ldtmu;
703         merge.sig.ldvary |= b->sig.ldvary;
704         merge.sig.ldvpm |= b->sig.ldvpm;
705         merge.sig.small_imm |= b->sig.small_imm;
706         merge.sig.ldtlb |= b->sig.ldtlb;
707         merge.sig.ldtlbu |= b->sig.ldtlbu;
708         merge.sig.ucb |= b->sig.ucb;
709         merge.sig.rotate |= b->sig.rotate;
710         merge.sig.wrtmuc |= b->sig.wrtmuc;
711 
712         if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
713             v3d_qpu_sig_writes_address(devinfo, &b->sig))
714                 return false;
715         merge.sig_addr |= b->sig_addr;
716         merge.sig_magic |= b->sig_magic;
717 
718         uint64_t packed;
719         bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
720 
721         *result = merge;
722         /* No modifying the real instructions on failure. */
723         assert(ok || (a != result && b != result));
724 
725         return ok;
726 }
727 
728 static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info * devinfo,struct choose_scoreboard * scoreboard,struct list_head * schedule_list,struct schedule_node * prev_inst)729 choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
730                                struct choose_scoreboard *scoreboard,
731                                struct list_head *schedule_list,
732                                struct schedule_node *prev_inst)
733 {
734         struct schedule_node *chosen = NULL;
735         int chosen_prio = 0;
736 
737         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
738          * will handle pairing it along with filling the delay slots.
739          */
740         if (prev_inst) {
741                 if (prev_inst->inst->qpu.sig.thrsw)
742                         return NULL;
743         }
744 
745         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
746                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
747 
748                 /* Don't choose the branch instruction until it's the last one
749                  * left.  We'll move it up to fit its delay slots after we
750                  * choose it.
751                  */
752                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
753                     !list_is_singular(schedule_list)) {
754                         continue;
755                 }
756 
757                 /* "An instruction must not read from a location in physical
758                  *  regfile A or B that was written to by the previous
759                  *  instruction."
760                  */
761                 if (reads_too_soon_after_write(scoreboard, n->inst))
762                         continue;
763 
764                 if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
765                         continue;
766 
767                 /* "A scoreboard wait must not occur in the first two
768                  *  instructions of a fragment shader. This is either the
769                  *  explicit Wait for Scoreboard signal or an implicit wait
770                  *  with the first tile-buffer read or write instruction."
771                  */
772                 if (pixel_scoreboard_too_soon(scoreboard, inst))
773                         continue;
774 
775                 /* ldunif and ldvary both write r5, but ldunif does so a tick
776                  * sooner.  If the ldvary's r5 wasn't used, then ldunif might
777                  * otherwise get scheduled so ldunif and ldvary try to update
778                  * r5 in the same tick.
779                  */
780                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
781                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
782                         continue;
783                 }
784 
785                 /* If we're trying to pair with another instruction, check
786                  * that they're compatible.
787                  */
788                 if (prev_inst) {
789                         /* Don't pair up a thread switch signal -- we'll
790                          * handle pairing it when we pick it on its own.
791                          */
792                         if (inst->sig.thrsw)
793                                 continue;
794 
795                         if (prev_inst->inst->uniform != -1 &&
796                             n->inst->uniform != -1)
797                                 continue;
798 
799                         /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
801                          * other instructions, allowing us to delay the
802                          * TLB-locking instruction until later.
803                          */
804                         if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
805                                 continue;
806 
807                         struct v3d_qpu_instr merged_inst;
808                         if (!qpu_merge_inst(devinfo, &merged_inst,
809                                             &prev_inst->inst->qpu, inst)) {
810                                 continue;
811                         }
812                 }
813 
814                 int prio = get_instruction_priority(inst);
815 
816                 /* Found a valid instruction.  If nothing better comes along,
817                  * this one works.
818                  */
819                 if (!chosen) {
820                         chosen = n;
821                         chosen_prio = prio;
822                         continue;
823                 }
824 
825                 if (prio > chosen_prio) {
826                         chosen = n;
827                         chosen_prio = prio;
828                 } else if (prio < chosen_prio) {
829                         continue;
830                 }
831 
832                 if (n->delay > chosen->delay) {
833                         chosen = n;
834                         chosen_prio = prio;
835                 } else if (n->delay < chosen->delay) {
836                         continue;
837                 }
838         }
839 
840         return chosen;
841 }
842 
843 static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard * scoreboard,enum v3d_qpu_waddr waddr)844 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
845                                   enum v3d_qpu_waddr waddr)
846 {
847         if (v3d_qpu_magic_waddr_is_sfu(waddr))
848                 scoreboard->last_sfu_write_tick = scoreboard->tick;
849 }
850 
851 static void
update_scoreboard_for_chosen(struct choose_scoreboard * scoreboard,const struct v3d_qpu_instr * inst)852 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
853                              const struct v3d_qpu_instr *inst)
854 {
855         scoreboard->last_waddr_add = ~0;
856         scoreboard->last_waddr_mul = ~0;
857 
858         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
859                 return;
860 
861         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
862 
863         if (inst->alu.add.op != V3D_QPU_A_NOP)  {
864                 if (inst->alu.add.magic_write) {
865                         update_scoreboard_for_magic_waddr(scoreboard,
866                                                           inst->alu.add.waddr);
867                 } else {
868                         scoreboard->last_waddr_add = inst->alu.add.waddr;
869                 }
870         }
871 
872         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
873                 if (inst->alu.mul.magic_write) {
874                         update_scoreboard_for_magic_waddr(scoreboard,
875                                                           inst->alu.mul.waddr);
876                 } else {
877                         scoreboard->last_waddr_mul = inst->alu.mul.waddr;
878                 }
879         }
880 
881         if (inst->sig.ldvary)
882                 scoreboard->last_ldvary_tick = scoreboard->tick;
883 
884         if (qpu_inst_is_tlb(inst))
885                 scoreboard->tlb_locked = true;
886 }
887 
888 static void
dump_state(const struct v3d_device_info * devinfo,struct list_head * schedule_list)889 dump_state(const struct v3d_device_info *devinfo,
890            struct list_head *schedule_list)
891 {
892         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
893                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
894                 v3d_qpu_dump(devinfo, &n->inst->qpu);
895                 fprintf(stderr, "\n");
896 
897                 for (int i = 0; i < n->child_count; i++) {
898                         struct schedule_node *child = n->children[i].node;
899                         if (!child)
900                                 continue;
901 
902                         fprintf(stderr, "                 - ");
903                         v3d_qpu_dump(devinfo, &child->inst->qpu);
904                         fprintf(stderr, " (%d parents, %c)\n",
905                                 child->parent_count,
906                                 n->children[i].write_after_read ? 'w' : 'r');
907                 }
908         }
909 }
910 
magic_waddr_latency(enum v3d_qpu_waddr waddr,const struct v3d_qpu_instr * after)911 static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
912                                     const struct v3d_qpu_instr *after)
913 {
914         /* Apply some huge latency between texture fetch requests and getting
915          * their results back.
916          *
917          * FIXME: This is actually pretty bogus.  If we do:
918          *
919          * mov tmu0_s, a
920          * <a bit of math>
921          * mov tmu0_s, b
922          * load_tmu0
923          * <more math>
924          * load_tmu0
925          *
926          * we count that as worse than
927          *
928          * mov tmu0_s, a
929          * mov tmu0_s, b
930          * <lots of math>
931          * load_tmu0
932          * <more math>
933          * load_tmu0
934          *
935          * because we associate the first load_tmu0 with the *second* tmu0_s.
936          */
937         if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
938                 return 100;
939 
940         /* Assume that anything depending on us is consuming the SFU result. */
941         if (v3d_qpu_magic_waddr_is_sfu(waddr))
942                 return 3;
943 
944         return 1;
945 }
946 
947 static uint32_t
instruction_latency(struct schedule_node * before,struct schedule_node * after)948 instruction_latency(struct schedule_node *before, struct schedule_node *after)
949 {
950         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
951         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
952         uint32_t latency = 1;
953 
954         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
955             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
956                 return latency;
957 
958         if (before_inst->alu.add.magic_write) {
959                 latency = MAX2(latency,
960                                magic_waddr_latency(before_inst->alu.add.waddr,
961                                                    after_inst));
962         }
963 
964         if (before_inst->alu.mul.magic_write) {
965                 latency = MAX2(latency,
966                                magic_waddr_latency(before_inst->alu.mul.waddr,
967                                                    after_inst));
968         }
969 
970         return latency;
971 }
972 
973 /** Recursive computation of the delay member of a node. */
974 static void
compute_delay(struct schedule_node * n)975 compute_delay(struct schedule_node *n)
976 {
977         if (!n->child_count) {
978                 n->delay = 1;
979         } else {
980                 for (int i = 0; i < n->child_count; i++) {
981                         if (!n->children[i].node->delay)
982                                 compute_delay(n->children[i].node);
983                         n->delay = MAX2(n->delay,
984                                         n->children[i].node->delay +
985                                         instruction_latency(n, n->children[i].node));
986                 }
987         }
988 }
989 
990 static void
mark_instruction_scheduled(struct list_head * schedule_list,uint32_t time,struct schedule_node * node,bool war_only)991 mark_instruction_scheduled(struct list_head *schedule_list,
992                            uint32_t time,
993                            struct schedule_node *node,
994                            bool war_only)
995 {
996         if (!node)
997                 return;
998 
999         for (int i = node->child_count - 1; i >= 0; i--) {
1000                 struct schedule_node *child =
1001                         node->children[i].node;
1002 
1003                 if (!child)
1004                         continue;
1005 
1006                 if (war_only && !node->children[i].write_after_read)
1007                         continue;
1008 
1009                 /* If the requirement is only that the node not appear before
1010                  * the last read of its destination, then it can be scheduled
1011                  * immediately after (or paired with!) the thing reading the
1012                  * destination.
1013                  */
1014                 uint32_t latency = 0;
1015                 if (!war_only) {
1016                         latency = instruction_latency(node,
1017                                                       node->children[i].node);
1018                 }
1019 
1020                 child->unblocked_time = MAX2(child->unblocked_time,
1021                                              time + latency);
1022                 child->parent_count--;
1023                 if (child->parent_count == 0)
1024                         list_add(&child->link, schedule_list);
1025 
1026                 node->children[i].node = NULL;
1027         }
1028 }
1029 
1030 static void
insert_scheduled_instruction(struct v3d_compile * c,struct qblock * block,struct choose_scoreboard * scoreboard,struct qinst * inst)1031 insert_scheduled_instruction(struct v3d_compile *c,
1032                              struct qblock *block,
1033                              struct choose_scoreboard *scoreboard,
1034                              struct qinst *inst)
1035 {
1036         list_addtail(&inst->link, &block->instructions);
1037 
1038         update_scoreboard_for_chosen(scoreboard, &inst->qpu);
1039         c->qpu_inst_count++;
1040         scoreboard->tick++;
1041 }
1042 
1043 static struct qinst *
vir_nop()1044 vir_nop()
1045 {
1046         struct qreg undef = { QFILE_NULL, 0 };
1047         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1048 
1049         return qinst;
1050 }
1051 
/**
 * Creates a NOP with vir_nop() and appends it to block through
 * insert_scheduled_instruction().
 */
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        struct qinst *nop = vir_nop();

        insert_scheduled_instruction(c, block, scoreboard, nop);
}
1058 
1059 static bool
qpu_instruction_valid_in_thrend_slot(struct v3d_compile * c,const struct qinst * qinst,int slot)1060 qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
1061                                      const struct qinst *qinst, int slot)
1062 {
1063         const struct v3d_qpu_instr *inst = &qinst->qpu;
1064 
1065         /* Only TLB Z writes are prohibited in the last slot, but we don't
1066          * have those flagged so prohibit all TLB ops for now.
1067          */
1068         if (slot == 2 && qpu_inst_is_tlb(inst))
1069                 return false;
1070 
1071         if (slot > 0 && qinst->uniform != ~0)
1072                 return false;
1073 
1074         if (v3d_qpu_uses_vpm(inst))
1075                 return false;
1076 
1077         if (inst->sig.ldvary)
1078                 return false;
1079 
1080         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1081                 /* No writing physical registers at the end. */
1082                 if (!inst->alu.add.magic_write ||
1083                     !inst->alu.mul.magic_write) {
1084                         return false;
1085                 }
1086 
1087                 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
1088                         return false;
1089 
1090                 /* RF0-2 might be overwritten during the delay slots by
1091                  * fragment shader setup.
1092                  */
1093                 if (inst->raddr_a < 3 &&
1094                     (inst->alu.add.a == V3D_QPU_MUX_A ||
1095                      inst->alu.add.b == V3D_QPU_MUX_A ||
1096                      inst->alu.mul.a == V3D_QPU_MUX_A ||
1097                      inst->alu.mul.b == V3D_QPU_MUX_A)) {
1098                         return false;
1099                 }
1100 
1101                 if (inst->raddr_b < 3 &&
1102                     !inst->sig.small_imm &&
1103                     (inst->alu.add.a == V3D_QPU_MUX_B ||
1104                      inst->alu.add.b == V3D_QPU_MUX_B ||
1105                      inst->alu.mul.a == V3D_QPU_MUX_B ||
1106                      inst->alu.mul.b == V3D_QPU_MUX_B)) {
1107                         return false;
1108                 }
1109         }
1110 
1111         return true;
1112 }
1113 
1114 static bool
valid_thrsw_sequence(struct v3d_compile * c,struct qinst * qinst,int instructions_in_sequence,bool is_thrend)1115 valid_thrsw_sequence(struct v3d_compile *c,
1116                      struct qinst *qinst, int instructions_in_sequence,
1117                      bool is_thrend)
1118 {
1119         for (int slot = 0; slot < instructions_in_sequence; slot++) {
1120                 /* No scheduling SFU when the result would land in the other
1121                  * thread.  The simulator complains for safety, though it
1122                  * would only occur for dead code in our case.
1123                  */
1124                 if (slot > 0 &&
1125                     qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1126                     (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
1127                      v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
1128                         return false;
1129                 }
1130 
1131                 if (slot > 0 && qinst->qpu.sig.ldvary)
1132                         return false;
1133 
1134                 if (is_thrend &&
1135                     !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
1136                         return false;
1137                 }
1138 
1139                 /* Note that the list is circular, so we can only do this up
1140                  * to instructions_in_sequence.
1141                  */
1142                 qinst = (struct qinst *)qinst->link.next;
1143         }
1144 
1145         return true;
1146 }
1147 
1148 /**
1149  * Emits a THRSW signal in the stream, trying to move it up to pair with
1150  * another instruction.
1151  */
1152 static int
emit_thrsw(struct v3d_compile * c,struct qblock * block,struct choose_scoreboard * scoreboard,struct qinst * inst,bool is_thrend)1153 emit_thrsw(struct v3d_compile *c,
1154            struct qblock *block,
1155            struct choose_scoreboard *scoreboard,
1156            struct qinst *inst,
1157            bool is_thrend)
1158 {
1159         int time = 0;
1160 
1161         /* There should be nothing in a thrsw inst being scheduled other than
1162          * the signal bits.
1163          */
1164         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
1165         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
1166         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
1167 
1168         /* Find how far back into previous instructions we can put the THRSW. */
1169         int slots_filled = 0;
1170         struct qinst *merge_inst = NULL;
1171         vir_for_each_inst_rev(prev_inst, block) {
1172                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
1173                 sig.thrsw = true;
1174                 uint32_t packed_sig;
1175 
1176                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
1177                         break;
1178 
1179                 if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
1180                                           is_thrend)) {
1181                         break;
1182                 }
1183 
1184                 merge_inst = prev_inst;
1185                 if (++slots_filled == 3)
1186                         break;
1187         }
1188 
1189         bool needs_free = false;
1190         if (merge_inst) {
1191                 merge_inst->qpu.sig.thrsw = true;
1192                 needs_free = true;
1193         } else {
1194                 insert_scheduled_instruction(c, block, scoreboard, inst);
1195                 time++;
1196                 slots_filled++;
1197                 merge_inst = inst;
1198         }
1199 
1200         /* Insert any extra delay slot NOPs we need. */
1201         for (int i = 0; i < 3 - slots_filled; i++) {
1202                 emit_nop(c, block, scoreboard);
1203                 time++;
1204         }
1205 
1206         /* If we're emitting the last THRSW (other than program end), then
1207          * signal that to the HW by emitting two THRSWs in a row.
1208          */
1209         if (inst->is_last_thrsw) {
1210                 struct qinst *second_inst =
1211                         (struct qinst *)merge_inst->link.next;
1212                 second_inst->qpu.sig.thrsw = true;
1213         }
1214 
1215         /* If we put our THRSW into another instruction, free up the
1216          * instruction that didn't end up scheduled into the list.
1217          */
1218         if (needs_free)
1219                 free(inst);
1220 
1221         return time;
1222 }
1223 
1224 static uint32_t
schedule_instructions(struct v3d_compile * c,struct choose_scoreboard * scoreboard,struct qblock * block,struct list_head * schedule_list,enum quniform_contents * orig_uniform_contents,uint32_t * orig_uniform_data,uint32_t * next_uniform)1225 schedule_instructions(struct v3d_compile *c,
1226                       struct choose_scoreboard *scoreboard,
1227                       struct qblock *block,
1228                       struct list_head *schedule_list,
1229                       enum quniform_contents *orig_uniform_contents,
1230                       uint32_t *orig_uniform_data,
1231                       uint32_t *next_uniform)
1232 {
1233         const struct v3d_device_info *devinfo = c->devinfo;
1234         uint32_t time = 0;
1235 
1236         if (debug) {
1237                 fprintf(stderr, "initial deps:\n");
1238                 dump_state(devinfo, schedule_list);
1239                 fprintf(stderr, "\n");
1240         }
1241 
1242         /* Remove non-DAG heads from the list. */
1243         list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
1244                 if (n->parent_count != 0)
1245                         list_del(&n->link);
1246         }
1247 
1248         while (!list_empty(schedule_list)) {
1249                 struct schedule_node *chosen =
1250                         choose_instruction_to_schedule(devinfo,
1251                                                        scoreboard,
1252                                                        schedule_list,
1253                                                        NULL);
1254                 struct schedule_node *merge = NULL;
1255 
1256                 /* If there are no valid instructions to schedule, drop a NOP
1257                  * in.
1258                  */
1259                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
1260                 struct v3d_qpu_instr *inst = &qinst->qpu;
1261 
1262                 if (debug) {
1263                         fprintf(stderr, "t=%4d: current list:\n",
1264                                 time);
1265                         dump_state(devinfo, schedule_list);
1266                         fprintf(stderr, "t=%4d: chose:   ", time);
1267                         v3d_qpu_dump(devinfo, inst);
1268                         fprintf(stderr, "\n");
1269                 }
1270 
1271                 /* Schedule this instruction onto the QPU list. Also try to
1272                  * find an instruction to pair with it.
1273                  */
1274                 if (chosen) {
1275                         time = MAX2(chosen->unblocked_time, time);
1276                         list_del(&chosen->link);
1277                         mark_instruction_scheduled(schedule_list, time,
1278                                                    chosen, true);
1279 
1280                         merge = choose_instruction_to_schedule(devinfo,
1281                                                                scoreboard,
1282                                                                schedule_list,
1283                                                                chosen);
1284                         if (merge) {
1285                                 time = MAX2(merge->unblocked_time, time);
1286                                 list_del(&merge->link);
1287                                 (void)qpu_merge_inst(devinfo, inst,
1288                                                      inst, &merge->inst->qpu);
1289                                 if (merge->inst->uniform != -1) {
1290                                         chosen->inst->uniform =
1291                                                 merge->inst->uniform;
1292                                 }
1293 
1294                                 if (debug) {
1295                                         fprintf(stderr, "t=%4d: merging: ",
1296                                                 time);
1297                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
1298                                         fprintf(stderr, "\n");
1299                                         fprintf(stderr, "         result: ");
1300                                         v3d_qpu_dump(devinfo, inst);
1301                                         fprintf(stderr, "\n");
1302                                 }
1303                         }
1304                 }
1305 
1306                 /* Update the uniform index for the rewritten location --
1307                  * branch target updating will still need to change
1308                  * c->uniform_data[] using this index.
1309                  */
1310                 if (qinst->uniform != -1) {
1311                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1312                                 block->branch_uniform = *next_uniform;
1313 
1314                         c->uniform_data[*next_uniform] =
1315                                 orig_uniform_data[qinst->uniform];
1316                         c->uniform_contents[*next_uniform] =
1317                                 orig_uniform_contents[qinst->uniform];
1318                         qinst->uniform = *next_uniform;
1319                         (*next_uniform)++;
1320                 }
1321 
1322                 if (debug) {
1323                         fprintf(stderr, "\n");
1324                 }
1325 
1326                 /* Now that we've scheduled a new instruction, some of its
1327                  * children can be promoted to the list of instructions ready to
1328                  * be scheduled.  Update the children's unblocked time for this
1329                  * DAG edge as we do so.
1330                  */
1331                 mark_instruction_scheduled(schedule_list, time, chosen, false);
1332 
1333                 if (merge) {
1334                         mark_instruction_scheduled(schedule_list, time, merge,
1335                                                    false);
1336 
1337                         /* The merged VIR instruction doesn't get re-added to the
1338                          * block, so free it now.
1339                          */
1340                         free(merge->inst);
1341                 }
1342 
1343                 if (inst->sig.thrsw) {
1344                         time += emit_thrsw(c, block, scoreboard, qinst, false);
1345                 } else {
1346                         insert_scheduled_instruction(c, block,
1347                                                      scoreboard, qinst);
1348 
1349                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1350                                 block->branch_qpu_ip = c->qpu_inst_count - 1;
1351                                 /* Fill the delay slots.
1352                                  *
1353                                  * We should fill these with actual instructions,
1354                                  * instead, but that will probably need to be done
1355                                  * after this, once we know what the leading
1356                                  * instructions of the successors are (so we can
1357                                  * handle A/B register file write latency)
1358                                  */
1359                                 for (int i = 0; i < 3; i++)
1360                                         emit_nop(c, block, scoreboard);
1361                         }
1362                 }
1363         }
1364 
1365         return time;
1366 }
1367 
1368 static uint32_t
qpu_schedule_instructions_block(struct v3d_compile * c,struct choose_scoreboard * scoreboard,struct qblock * block,enum quniform_contents * orig_uniform_contents,uint32_t * orig_uniform_data,uint32_t * next_uniform)1369 qpu_schedule_instructions_block(struct v3d_compile *c,
1370                                 struct choose_scoreboard *scoreboard,
1371                                 struct qblock *block,
1372                                 enum quniform_contents *orig_uniform_contents,
1373                                 uint32_t *orig_uniform_data,
1374                                 uint32_t *next_uniform)
1375 {
1376         void *mem_ctx = ralloc_context(NULL);
1377         struct list_head schedule_list;
1378 
1379         list_inithead(&schedule_list);
1380 
1381         /* Wrap each instruction in a scheduler structure. */
1382         while (!list_empty(&block->instructions)) {
1383                 struct qinst *qinst = (struct qinst *)block->instructions.next;
1384                 struct schedule_node *n =
1385                         rzalloc(mem_ctx, struct schedule_node);
1386 
1387                 n->inst = qinst;
1388 
1389                 list_del(&qinst->link);
1390                 list_addtail(&n->link, &schedule_list);
1391         }
1392 
1393         calculate_forward_deps(c, &schedule_list);
1394         calculate_reverse_deps(c, &schedule_list);
1395 
1396         list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
1397                 compute_delay(n);
1398         }
1399 
1400         uint32_t cycles = schedule_instructions(c, scoreboard, block,
1401                                                 &schedule_list,
1402                                                 orig_uniform_contents,
1403                                                 orig_uniform_data,
1404                                                 next_uniform);
1405 
1406         ralloc_free(mem_ctx);
1407 
1408         return cycles;
1409 }
1410 
1411 static void
qpu_set_branch_targets(struct v3d_compile * c)1412 qpu_set_branch_targets(struct v3d_compile *c)
1413 {
1414         vir_for_each_block(block, c) {
1415                 /* The end block of the program has no branch. */
1416                 if (!block->successors[0])
1417                         continue;
1418 
1419                 /* If there was no branch instruction, then the successor
1420                  * block must follow immediately after this one.
1421                  */
1422                 if (block->branch_qpu_ip == ~0) {
1423                         assert(block->end_qpu_ip + 1 ==
1424                                block->successors[0]->start_qpu_ip);
1425                         continue;
1426                 }
1427 
1428                 /* Walk back through the delay slots to find the branch
1429                  * instr.
1430                  */
1431                 struct list_head *entry = block->instructions.prev;
1432                 for (int i = 0; i < 3; i++)
1433                         entry = entry->prev;
1434                 struct qinst *branch = container_of(entry, branch, link);
1435                 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1436 
1437                 /* Make sure that the if-we-don't-jump
1438                  * successor was scheduled just after the
1439                  * delay slots.
1440                  */
1441                 assert(!block->successors[1] ||
1442                        block->successors[1]->start_qpu_ip ==
1443                        block->branch_qpu_ip + 4);
1444 
1445                 branch->qpu.branch.offset =
1446                         ((block->successors[0]->start_qpu_ip -
1447                           (block->branch_qpu_ip + 4)) *
1448                          sizeof(uint64_t));
1449 
1450                 /* Set up the relative offset to jump in the
1451                  * uniform stream.
1452                  *
1453                  * Use a temporary here, because
1454                  * uniform_data[inst->uniform] may be shared
1455                  * between multiple instructions.
1456                  */
1457                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
1458                 c->uniform_data[branch->uniform] =
1459                         (block->successors[0]->start_uniform -
1460                          (block->branch_uniform + 1)) * 4;
1461         }
1462 }
1463 
1464 uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile * c)1465 v3d_qpu_schedule_instructions(struct v3d_compile *c)
1466 {
1467         const struct v3d_device_info *devinfo = c->devinfo;
1468         struct qblock *end_block = list_last_entry(&c->blocks,
1469                                                    struct qblock, link);
1470 
1471         /* We reorder the uniforms as we schedule instructions, so save the
1472          * old data off and replace it.
1473          */
1474         uint32_t *uniform_data = c->uniform_data;
1475         enum quniform_contents *uniform_contents = c->uniform_contents;
1476         c->uniform_contents = ralloc_array(c, enum quniform_contents,
1477                                            c->num_uniforms);
1478         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1479         c->uniform_array_size = c->num_uniforms;
1480         uint32_t next_uniform = 0;
1481 
1482         struct choose_scoreboard scoreboard;
1483         memset(&scoreboard, 0, sizeof(scoreboard));
1484         scoreboard.last_waddr_add = ~0;
1485         scoreboard.last_waddr_mul = ~0;
1486         scoreboard.last_ldvary_tick = -10;
1487         scoreboard.last_sfu_write_tick = -10;
1488         scoreboard.last_uniforms_reset_tick = -10;
1489 
1490         if (debug) {
1491                 fprintf(stderr, "Pre-schedule instructions\n");
1492                 vir_for_each_block(block, c) {
1493                         fprintf(stderr, "BLOCK %d\n", block->index);
1494                         list_for_each_entry(struct qinst, qinst,
1495                                             &block->instructions, link) {
1496                                 v3d_qpu_dump(devinfo, &qinst->qpu);
1497                                 fprintf(stderr, "\n");
1498                         }
1499                 }
1500                 fprintf(stderr, "\n");
1501         }
1502 
1503         uint32_t cycles = 0;
1504         vir_for_each_block(block, c) {
1505                 block->start_qpu_ip = c->qpu_inst_count;
1506                 block->branch_qpu_ip = ~0;
1507                 block->start_uniform = next_uniform;
1508 
1509                 cycles += qpu_schedule_instructions_block(c,
1510                                                           &scoreboard,
1511                                                           block,
1512                                                           uniform_contents,
1513                                                           uniform_data,
1514                                                           &next_uniform);
1515 
1516                 block->end_qpu_ip = c->qpu_inst_count - 1;
1517         }
1518 
1519         /* Emit the program-end THRSW instruction. */;
1520         struct qinst *thrsw = vir_nop();
1521         thrsw->qpu.sig.thrsw = true;
1522         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
1523 
1524         qpu_set_branch_targets(c);
1525 
1526         assert(next_uniform == c->num_uniforms);
1527 
1528         return cycles;
1529 }
1530