/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
vc4_dump_program(struct vc4_compile * c)32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 fprintf(stderr, "\n");
44 }
45
46 static void
queue(struct qblock * block,uint64_t inst)47 queue(struct qblock *block, uint64_t inst)
48 {
49 struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
50 q->inst = inst;
51 list_addtail(&q->link, &block->qpu_inst_list);
52 }
53
54 static uint64_t *
last_inst(struct qblock * block)55 last_inst(struct qblock *block)
56 {
57 struct queued_qpu_inst *q =
58 (struct queued_qpu_inst *)block->qpu_inst_list.prev;
59 return &q->inst;
60 }
61
/**
 * Rewrites the block's most recently queued instruction with the given
 * condition code applied through qpu_set_cond_add().
 */
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        uint64_t *inst = last_inst(block);

        *inst = qpu_set_cond_add(*inst, cond);
}
67
/**
 * Rewrites the block's most recently queued instruction with the given
 * condition code applied through qpu_set_cond_mul().
 */
static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        uint64_t *inst = last_inst(block);

        *inst = qpu_set_cond_mul(*inst, cond);
}
73
74 /**
75 * Some special registers can be read from either file, which lets us resolve
76 * raddr conflicts without extra MOVs.
77 */
78 static bool
swap_file(struct qpu_reg * src)79 swap_file(struct qpu_reg *src)
80 {
81 switch (src->addr) {
82 case QPU_R_UNIF:
83 case QPU_R_VARY:
84 if (src->mux == QPU_MUX_SMALL_IMM) {
85 return false;
86 } else {
87 if (src->mux == QPU_MUX_A)
88 src->mux = QPU_MUX_B;
89 else
90 src->mux = QPU_MUX_A;
91 return true;
92 }
93
94 default:
95 return false;
96 }
97 }
98
99 /**
100 * Sets up the VPM read FIFO before we do any VPM read.
101 *
102 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
103 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
104 * VPM block. In the VS/CS (unlike in the FS), the block starts out
105 * uninitialized, and you need to emit setup to the block before any VPM
106 * reads/writes.
107 *
108 * VRI has a FIFO in each direction, with each FIFO able to hold four
109 * 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
110 * writes go through the write FIFO. The read/write setup values from QPU go
111 * through the write FIFO as well, with a sideband signal indicating that
112 * they're setup values. Once a read setup reaches the other side of the
113 * FIFO, the VPM block will start asynchronously reading vertex attributes and
114 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
115 * on reads later.
116 *
117 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
118 * time, which is 4 vec4s. If more than that is being read (since we support
119 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
120 *
121 * The existence of the FIFO makes it seem like you should be able to emit
122 * both setups for the 5-8 attribute cases and then do all the attribute
123 * reads. However, once the setup value makes it to the other end of the
124 * write FIFO, it will immediately update the VPM block's setup register.
125 * That updated setup register would be used for read FIFO fills from then on,
126 * breaking whatever remaining VPM values were supposed to be read into the
127 * read FIFO from the previous attribute set.
128 *
129 * As a result, we need to emit the read setup, pull every VPM read value from
130 * that setup, and only then emit the second setup if applicable.
131 */
132 static void
setup_for_vpm_read(struct vc4_compile * c,struct qblock * block)133 setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
134 {
135 if (c->num_inputs_in_fifo) {
136 c->num_inputs_in_fifo--;
137 return;
138 }
139
140 c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
141
142 queue(block,
143 qpu_load_imm_ui(qpu_vrsetup(),
144 c->vpm_read_offset |
145 0x00001a00 |
146 ((c->num_inputs_in_fifo & 0xf) << 20)));
147 c->num_inputs_remaining -= c->num_inputs_in_fifo;
148 c->vpm_read_offset += c->num_inputs_in_fifo;
149
150 c->num_inputs_in_fifo--;
151 }
152
153 /**
154 * This is used to resolve the fact that we might register-allocate two
155 * different operands of an instruction to the same physical register file
156 * even though instructions have only one field for the register file source
157 * address.
158 *
159 * In that case, we need to move one to a temporary that can be used in the
160 * instruction, instead. We reserve ra14/rb14 for this purpose.
161 */
162 static void
fixup_raddr_conflict(struct qblock * block,struct qpu_reg dst,struct qpu_reg * src0,struct qpu_reg * src1,struct qinst * inst,uint64_t * unpack)163 fixup_raddr_conflict(struct qblock *block,
164 struct qpu_reg dst,
165 struct qpu_reg *src0, struct qpu_reg *src1,
166 struct qinst *inst, uint64_t *unpack)
167 {
168 uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
169 uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
170
171 if (mux0 <= QPU_MUX_R5 ||
172 mux0 != mux1 ||
173 (src0->addr == src1->addr &&
174 src0->mux == src1->mux)) {
175 return;
176 }
177
178 if (swap_file(src0) || swap_file(src1))
179 return;
180
181 if (mux0 == QPU_MUX_A) {
182 /* Make sure we use the same type of MOV as the instruction,
183 * in case of unpacks.
184 */
185 if (qir_is_float_input(inst))
186 queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
187 else
188 queue(block, qpu_a_MOV(qpu_rb(14), *src0));
189
190 /* If we had an unpack on this A-file source, we need to put
191 * it into this MOV, not into the later move from regfile B.
192 */
193 if (inst->src[0].pack) {
194 *last_inst(block) |= *unpack;
195 *unpack = 0;
196 }
197 *src0 = qpu_rb(14);
198 } else {
199 queue(block, qpu_a_MOV(qpu_ra(14), *src0));
200 *src0 = qpu_ra(14);
201 }
202 }
203
204 static void
set_last_dst_pack(struct qblock * block,struct qinst * inst)205 set_last_dst_pack(struct qblock *block, struct qinst *inst)
206 {
207 MAYBE_UNUSED bool had_pm = *last_inst(block) & QPU_PM;
208 MAYBE_UNUSED bool had_ws = *last_inst(block) & QPU_WS;
209 MAYBE_UNUSED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
210
211 if (!inst->dst.pack)
212 return;
213
214 *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
215
216 if (qir_is_mul(inst)) {
217 assert(!unpack || had_pm);
218 *last_inst(block) |= QPU_PM;
219 } else {
220 assert(!unpack || !had_pm);
221 assert(!had_ws); /* dst must be a-file to pack. */
222 }
223 }
224
225 static void
handle_r4_qpu_write(struct qblock * block,struct qinst * qinst,struct qpu_reg dst)226 handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
227 struct qpu_reg dst)
228 {
229 if (dst.mux != QPU_MUX_R4) {
230 queue(block, qpu_a_MOV(dst, qpu_r4()));
231 set_last_cond_add(block, qinst->cond);
232 } else {
233 assert(qinst->cond == QPU_COND_ALWAYS);
234 if (qinst->sf)
235 queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
236 }
237 }
238
239 static void
vc4_generate_code_block(struct vc4_compile * c,struct qblock * block,struct qpu_reg * temp_registers)240 vc4_generate_code_block(struct vc4_compile *c,
241 struct qblock *block,
242 struct qpu_reg *temp_registers)
243 {
244 int last_vpm_read_index = -1;
245
246 qir_for_each_inst(qinst, block) {
247 #if 0
248 fprintf(stderr, "translating qinst to qpu: ");
249 qir_dump_inst(qinst);
250 fprintf(stderr, "\n");
251 #endif
252
253 static const struct {
254 uint32_t op;
255 } translate[] = {
256 #define A(name) [QOP_##name] = {QPU_A_##name}
257 #define M(name) [QOP_##name] = {QPU_M_##name}
258 A(FADD),
259 A(FSUB),
260 A(FMIN),
261 A(FMAX),
262 A(FMINABS),
263 A(FMAXABS),
264 A(FTOI),
265 A(ITOF),
266 A(ADD),
267 A(SUB),
268 A(SHL),
269 A(SHR),
270 A(ASR),
271 A(MIN),
272 A(MAX),
273 A(AND),
274 A(OR),
275 A(XOR),
276 A(NOT),
277
278 M(FMUL),
279 M(V8MULD),
280 M(V8MIN),
281 M(V8MAX),
282 M(V8ADDS),
283 M(V8SUBS),
284 M(MUL24),
285
286 /* If we replicate src[0] out to src[1], this works
287 * out the same as a MOV.
288 */
289 [QOP_MOV] = { QPU_A_OR },
290 [QOP_FMOV] = { QPU_A_FMAX },
291 [QOP_MMOV] = { QPU_M_V8MIN },
292
293 [QOP_MIN_NOIMM] = { QPU_A_MIN },
294 };
295
296 uint64_t unpack = 0;
297 struct qpu_reg src[ARRAY_SIZE(qinst->src)];
298 for (int i = 0; i < qir_get_nsrc(qinst); i++) {
299 int index = qinst->src[i].index;
300 switch (qinst->src[i].file) {
301 case QFILE_NULL:
302 case QFILE_LOAD_IMM:
303 src[i] = qpu_rn(0);
304 break;
305 case QFILE_TEMP:
306 src[i] = temp_registers[index];
307 if (qinst->src[i].pack) {
308 assert(!unpack ||
309 unpack == qinst->src[i].pack);
310 unpack = QPU_SET_FIELD(qinst->src[i].pack,
311 QPU_UNPACK);
312 if (src[i].mux == QPU_MUX_R4)
313 unpack |= QPU_PM;
314 }
315 break;
316 case QFILE_UNIF:
317 src[i] = qpu_unif();
318 break;
319 case QFILE_VARY:
320 src[i] = qpu_vary();
321 break;
322 case QFILE_SMALL_IMM:
323 src[i].mux = QPU_MUX_SMALL_IMM;
324 src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
325 /* This should only have returned a valid
326 * small immediate field, not ~0 for failure.
327 */
328 assert(src[i].addr <= 47);
329 break;
330 case QFILE_VPM:
331 setup_for_vpm_read(c, block);
332 assert((int)qinst->src[i].index >=
333 last_vpm_read_index);
334 (void)last_vpm_read_index;
335 last_vpm_read_index = qinst->src[i].index;
336 src[i] = qpu_ra(QPU_R_VPM);
337 break;
338
339 case QFILE_FRAG_X:
340 src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
341 break;
342 case QFILE_FRAG_Y:
343 src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
344 break;
345 case QFILE_FRAG_REV_FLAG:
346 src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
347 break;
348 case QFILE_QPU_ELEMENT:
349 src[i] = qpu_ra(QPU_R_ELEM_QPU);
350 break;
351
352 case QFILE_TLB_COLOR_WRITE:
353 case QFILE_TLB_COLOR_WRITE_MS:
354 case QFILE_TLB_Z_WRITE:
355 case QFILE_TLB_STENCIL_SETUP:
356 case QFILE_TEX_S:
357 case QFILE_TEX_S_DIRECT:
358 case QFILE_TEX_T:
359 case QFILE_TEX_R:
360 case QFILE_TEX_B:
361 unreachable("bad qir src file");
362 }
363 }
364
365 struct qpu_reg dst;
366 switch (qinst->dst.file) {
367 case QFILE_NULL:
368 dst = qpu_ra(QPU_W_NOP);
369 break;
370 case QFILE_TEMP:
371 dst = temp_registers[qinst->dst.index];
372 break;
373 case QFILE_VPM:
374 dst = qpu_ra(QPU_W_VPM);
375 break;
376
377 case QFILE_TLB_COLOR_WRITE:
378 dst = qpu_tlbc();
379 break;
380
381 case QFILE_TLB_COLOR_WRITE_MS:
382 dst = qpu_tlbc_ms();
383 break;
384
385 case QFILE_TLB_Z_WRITE:
386 dst = qpu_ra(QPU_W_TLB_Z);
387 break;
388
389 case QFILE_TLB_STENCIL_SETUP:
390 dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
391 break;
392
393 case QFILE_TEX_S:
394 case QFILE_TEX_S_DIRECT:
395 dst = qpu_rb(QPU_W_TMU0_S);
396 break;
397
398 case QFILE_TEX_T:
399 dst = qpu_rb(QPU_W_TMU0_T);
400 break;
401
402 case QFILE_TEX_R:
403 dst = qpu_rb(QPU_W_TMU0_R);
404 break;
405
406 case QFILE_TEX_B:
407 dst = qpu_rb(QPU_W_TMU0_B);
408 break;
409
410 case QFILE_VARY:
411 case QFILE_UNIF:
412 case QFILE_SMALL_IMM:
413 case QFILE_LOAD_IMM:
414 case QFILE_FRAG_X:
415 case QFILE_FRAG_Y:
416 case QFILE_FRAG_REV_FLAG:
417 case QFILE_QPU_ELEMENT:
418 assert(!"not reached");
419 break;
420 }
421
422 MAYBE_UNUSED bool handled_qinst_cond = false;
423
424 switch (qinst->op) {
425 case QOP_RCP:
426 case QOP_RSQ:
427 case QOP_EXP2:
428 case QOP_LOG2:
429 switch (qinst->op) {
430 case QOP_RCP:
431 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
432 src[0]) | unpack);
433 break;
434 case QOP_RSQ:
435 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
436 src[0]) | unpack);
437 break;
438 case QOP_EXP2:
439 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
440 src[0]) | unpack);
441 break;
442 case QOP_LOG2:
443 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
444 src[0]) | unpack);
445 break;
446 default:
447 abort();
448 }
449
450 handle_r4_qpu_write(block, qinst, dst);
451 handled_qinst_cond = true;
452
453 break;
454
455 case QOP_LOAD_IMM:
456 assert(qinst->src[0].file == QFILE_LOAD_IMM);
457 queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
458 break;
459
460 case QOP_LOAD_IMM_U2:
461 queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
462 break;
463
464 case QOP_LOAD_IMM_I2:
465 queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
466 break;
467
468 case QOP_ROT_MUL:
469 /* Rotation at the hardware level occurs on the inputs
470 * to the MUL unit, and they must be accumulators in
471 * order to have the time necessary to move things.
472 */
473 assert(src[0].mux <= QPU_MUX_R3);
474
475 queue(block,
476 qpu_m_rot(dst, src[0], qinst->src[1].index -
477 QPU_SMALL_IMM_MUL_ROT) | unpack);
478 set_last_cond_mul(block, qinst->cond);
479 handled_qinst_cond = true;
480 set_last_dst_pack(block, qinst);
481 break;
482
483 case QOP_MS_MASK:
484 src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
485 fixup_raddr_conflict(block, dst, &src[0], &src[1],
486 qinst, &unpack);
487 queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
488 src[0], src[1]) | unpack);
489 break;
490
491 case QOP_FRAG_Z:
492 case QOP_FRAG_W:
493 /* QOP_FRAG_Z/W don't emit instructions, just allocate
494 * the register to the Z/W payload.
495 */
496 break;
497
498 case QOP_TLB_COLOR_READ:
499 queue(block, qpu_NOP());
500 *last_inst(block) = qpu_set_sig(*last_inst(block),
501 QPU_SIG_COLOR_LOAD);
502 handle_r4_qpu_write(block, qinst, dst);
503 handled_qinst_cond = true;
504 break;
505
506 case QOP_VARY_ADD_C:
507 queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
508 break;
509
510
511 case QOP_TEX_RESULT:
512 queue(block, qpu_NOP());
513 *last_inst(block) = qpu_set_sig(*last_inst(block),
514 QPU_SIG_LOAD_TMU0);
515 handle_r4_qpu_write(block, qinst, dst);
516 handled_qinst_cond = true;
517 break;
518
519 case QOP_THRSW:
520 queue(block, qpu_NOP());
521 *last_inst(block) = qpu_set_sig(*last_inst(block),
522 QPU_SIG_THREAD_SWITCH);
523 c->last_thrsw = last_inst(block);
524 break;
525
526 case QOP_BRANCH:
527 /* The branch target will be updated at QPU scheduling
528 * time.
529 */
530 queue(block, (qpu_branch(qinst->cond, 0) |
531 QPU_BRANCH_REL));
532 handled_qinst_cond = true;
533 break;
534
535 case QOP_UNIFORMS_RESET:
536 fixup_raddr_conflict(block, dst, &src[0], &src[1],
537 qinst, &unpack);
538
539 queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
540 src[0], src[1]));
541 break;
542
543 default:
544 assert(qinst->op < ARRAY_SIZE(translate));
545 assert(translate[qinst->op].op != 0); /* NOPs */
546
547 /* Skip emitting the MOV if it's a no-op. */
548 if (qir_is_raw_mov(qinst) &&
549 dst.mux == src[0].mux && dst.addr == src[0].addr) {
550 break;
551 }
552
553 /* If we have only one source, put it in the second
554 * argument slot as well so that we don't take up
555 * another raddr just to get unused data.
556 */
557 if (qir_get_non_sideband_nsrc(qinst) == 1)
558 src[1] = src[0];
559
560 fixup_raddr_conflict(block, dst, &src[0], &src[1],
561 qinst, &unpack);
562
563 if (qir_is_mul(qinst)) {
564 queue(block, qpu_m_alu2(translate[qinst->op].op,
565 dst,
566 src[0], src[1]) | unpack);
567 set_last_cond_mul(block, qinst->cond);
568 } else {
569 queue(block, qpu_a_alu2(translate[qinst->op].op,
570 dst,
571 src[0], src[1]) | unpack);
572 set_last_cond_add(block, qinst->cond);
573 }
574 handled_qinst_cond = true;
575 set_last_dst_pack(block, qinst);
576
577 break;
578 }
579
580 assert(qinst->cond == QPU_COND_ALWAYS ||
581 handled_qinst_cond);
582
583 if (qinst->sf)
584 *last_inst(block) |= QPU_SF;
585 }
586 }
587
588 void
vc4_generate_code(struct vc4_context * vc4,struct vc4_compile * c)589 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
590 {
591 struct qblock *start_block = list_first_entry(&c->blocks,
592 struct qblock, link);
593
594 struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
595 if (!temp_registers)
596 return;
597
598 switch (c->stage) {
599 case QSTAGE_VERT:
600 case QSTAGE_COORD:
601 c->num_inputs_remaining = c->num_inputs;
602 queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
603 break;
604 case QSTAGE_FRAG:
605 break;
606 }
607
608 qir_for_each_block(block, c)
609 vc4_generate_code_block(c, block, temp_registers);
610
611 /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
612 *
613 * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
614 * that ensures that a later thread doesn't try to lock the scoreboard
615 * and terminate before an earlier-spawned thread on the same QPU, by
616 * delaying switching back to the later shader until earlier has
617 * finished. Otherwise, if the earlier thread was hitting the same
618 * quad, the scoreboard would deadlock.
619 */
620 if (c->last_thrsw) {
621 assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
622 QPU_SIG_THREAD_SWITCH);
623 *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
624 QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
625 QPU_SIG));
626 }
627
628 uint32_t cycles = qpu_schedule_instructions(c);
629 uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
630
631 /* thread end can't have VPM write or read */
632 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
633 QPU_WADDR_ADD) == QPU_W_VPM ||
634 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
635 QPU_WADDR_MUL) == QPU_W_VPM ||
636 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
637 QPU_RADDR_A) == QPU_R_VPM ||
638 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
639 QPU_RADDR_B) == QPU_R_VPM) {
640 qpu_serialize_one_inst(c, qpu_NOP());
641 }
642
643 /* thread end can't have uniform read */
644 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
645 QPU_RADDR_A) == QPU_R_UNIF ||
646 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
647 QPU_RADDR_B) == QPU_R_UNIF) {
648 qpu_serialize_one_inst(c, qpu_NOP());
649 }
650
651 /* thread end can't have TLB operations */
652 if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
653 qpu_serialize_one_inst(c, qpu_NOP());
654
655 /* Make sure there's no existing signal set (like for a small
656 * immediate)
657 */
658 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
659 QPU_SIG) != QPU_SIG_NONE) {
660 qpu_serialize_one_inst(c, qpu_NOP());
661 }
662
663 c->qpu_insts[c->qpu_inst_count - 1] =
664 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
665 QPU_SIG_PROG_END);
666 qpu_serialize_one_inst(c, qpu_NOP());
667 qpu_serialize_one_inst(c, qpu_NOP());
668
669 switch (c->stage) {
670 case QSTAGE_VERT:
671 case QSTAGE_COORD:
672 break;
673 case QSTAGE_FRAG:
674 c->qpu_insts[c->qpu_inst_count - 1] =
675 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
676 QPU_SIG_SCOREBOARD_UNLOCK);
677 break;
678 }
679
680 cycles += c->qpu_inst_count - inst_count_at_schedule_time;
681
682 if (vc4_debug & VC4_DEBUG_SHADERDB) {
683 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
684 qir_get_stage_name(c->stage),
685 c->program_id, c->variant_id,
686 cycles);
687 }
688
689 if (vc4_debug & VC4_DEBUG_QPU)
690 vc4_dump_program(c);
691
692 vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
693
694 free(temp_registers);
695 }
696