1 
2 /*
3  * Copyright © 2014 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #include "vc4_qpu.h"
26 
/**
 * Reports a validation failure for one instruction and terminates.
 *
 * Writes "vc4_qpu_validate: <msg>: " to stderr, hands the instruction to
 * vc4_qpu_disasm(), finishes the line on stderr and then calls abort(), so
 * this function never returns to its caller.
 */
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);

        /* Disassemble just the single offending instruction. */
        vc4_qpu_disasm(&inst, 1);

        fputc('\n', stderr);
        abort();
}
35 
36 static bool
writes_reg(uint64_t inst,uint32_t w)37 writes_reg(uint64_t inst, uint32_t w)
38 {
39         return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
40                 QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
41 }
42 
43 static bool
_reads_reg(uint64_t inst,uint32_t r,bool ignore_a,bool ignore_b)44 _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
45 {
46         struct {
47                 uint32_t mux, addr;
48         } src_regs[] = {
49                 { QPU_GET_FIELD(inst, QPU_ADD_A) },
50                 { QPU_GET_FIELD(inst, QPU_ADD_B) },
51                 { QPU_GET_FIELD(inst, QPU_MUL_A) },
52                 { QPU_GET_FIELD(inst, QPU_MUL_B) },
53         };
54 
55         /* Branches only reference raddr_a (no mux), and we don't use that
56          * feature of branching.
57          */
58         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
59                 return false;
60 
61         /* Load immediates don't read any registers. */
62         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
63                 return false;
64 
65         for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
66                 if (!ignore_a &&
67                     src_regs[i].mux == QPU_MUX_A &&
68                     (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
69                         return true;
70 
71                 if (!ignore_b &&
72                     QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
73                     src_regs[i].mux == QPU_MUX_B &&
74                     (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
75                         return true;
76         }
77 
78         return false;
79 }
80 
/**
 * Returns true when the instruction reads address r through either the
 * regfile A or the regfile B mux (see _reads_reg()).
 */
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_a = false;
        const bool ignore_b = false;

        return _reads_reg(inst, r, ignore_a, ignore_b);
}
86 
/**
 * Returns true when the instruction reads address r through the regfile A
 * mux; regfile B is left out of the search (see _reads_reg()).
 */
static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_a = false;
        const bool ignore_b = true;

        return _reads_reg(inst, r, ignore_a, ignore_b);
}
92 
/**
 * Returns true when the instruction reads address r through the regfile B
 * mux; regfile A is left out of the search (see _reads_reg()).
 */
static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        const bool ignore_a = true;
        const bool ignore_b = false;

        return _reads_reg(inst, r, ignore_a, ignore_b);
}
98 
99 static bool
writes_sfu(uint64_t inst)100 writes_sfu(uint64_t inst)
101 {
102         return (writes_reg(inst, QPU_W_SFU_RECIP) ||
103                 writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
104                 writes_reg(inst, QPU_W_SFU_EXP) ||
105                 writes_reg(inst, QPU_W_SFU_LOG));
106 }
107 
108 /**
109  * Checks for the instruction restrictions from page 37 ("Summary of
110  * Instruction Restrictions").
111  */
112 void
vc4_qpu_validate(uint64_t * insts,uint32_t num_inst)113 vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
114 {
115         bool scoreboard_locked = false;
116         bool threaded = false;
117 
118         /* We don't want to do validation in release builds, but we want to
119          * keep compiling the validation code to make sure it doesn't get
120          * broken.
121          */
122 #ifndef DEBUG
123         return;
124 #endif
125 
126         for (int i = 0; i < num_inst; i++) {
127                 uint64_t inst = insts[i];
128                 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
129 
130                 if (sig != QPU_SIG_PROG_END) {
131                         if (qpu_inst_is_tlb(inst))
132                                 scoreboard_locked = true;
133 
134                         if (sig == QPU_SIG_THREAD_SWITCH ||
135                             sig == QPU_SIG_LAST_THREAD_SWITCH) {
136                                 threaded = true;
137                         }
138 
139                         continue;
140                 }
141 
142                 /* "The Thread End instruction must not write to either physical
143                  *  regfile A or B."
144                  */
145                 if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
146                     QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
147                         fail_instr(inst, "write to phys reg in thread end");
148                 }
149 
150                 /* Can't trigger an implicit wait on scoreboard in the program
151                  * end instruction.
152                  */
153                 if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
154                         fail_instr(inst, "implicit sb wait in program end");
155 
156                 /* Two delay slots will be executed. */
157                 assert(i + 2 <= num_inst);
158 
159                  for (int j = i; j < i + 2; j++) {
160                          /* "The last three instructions of any program
161                           *  (Thread End plus the following two delay-slot
162                           *  instructions) must not do varyings read, uniforms
163                           *  read or any kind of VPM, VDR, or VDW read or
164                           *  write."
165                           */
166                          if (writes_reg(insts[j], QPU_W_VPM) ||
167                              reads_reg(insts[j], QPU_R_VARY) ||
168                              reads_reg(insts[j], QPU_R_UNIF) ||
169                              reads_reg(insts[j], QPU_R_VPM)) {
170                                  fail_instr(insts[j], "last 3 instructions "
171                                             "using fixed functions");
172                          }
173 
174                          /* "The Thread End instruction and the following two
175                           *  delay slot instructions must not write or read
176                           *  address 14 in either regfile A or B."
177                           */
178                          if (writes_reg(insts[j], 14) ||
179                              reads_reg(insts[j], 14)) {
180                                  fail_instr(insts[j], "last 3 instructions "
181                                             "must not use r14");
182                          }
183                  }
184 
185                  /* "The final program instruction (the second delay slot
186                   *  instruction) must not do a TLB Z write."
187                   */
188                  if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
189                          fail_instr(insts[i + 2], "final instruction doing "
190                                     "Z write");
191                  }
192         }
193 
194         /* "A scoreboard wait must not occur in the first two instructions of
195          *  a fragment shader. This is either the explicit Wait for Scoreboard
196          *  signal or an implicit wait with the first tile-buffer read or
197          *  write instruction."
198          */
199         for (int i = 0; i < 2; i++) {
200                 uint64_t inst = insts[i];
201 
202                 if (qpu_inst_is_tlb(inst))
203                         fail_instr(inst, "sb wait in first two insts");
204         }
205 
206         /* "If TMU_NOSWAP is written, the write must be three instructions
207          *  before the first TMU write instruction.  For example, if
208          *  TMU_NOSWAP is written in the first shader instruction, the first
209          *  TMU write cannot occur before the 4th shader instruction."
210          */
211         int last_tmu_noswap = -10;
212         for (int i = 0; i < num_inst; i++) {
213                 uint64_t inst = insts[i];
214 
215                 if ((i - last_tmu_noswap) <= 3 &&
216                     (writes_reg(inst, QPU_W_TMU0_S) ||
217                      writes_reg(inst, QPU_W_TMU1_S))) {
218                         fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
219                 }
220 
221                 if (writes_reg(inst, QPU_W_TMU_NOSWAP))
222                     last_tmu_noswap = i;
223         }
224 
225         /* "An instruction must not read from a location in physical regfile A
226          *  or B that was written to by the previous instruction."
227          */
228         for (int i = 0; i < num_inst - 1; i++) {
229                 uint64_t inst = insts[i];
230                 uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
231                 uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
232                 uint32_t waddr_a, waddr_b;
233 
234                 if (inst & QPU_WS) {
235                         waddr_b = add_waddr;
236                         waddr_a = mul_waddr;
237                 } else {
238                         waddr_a = add_waddr;
239                         waddr_b = mul_waddr;
240                 }
241 
242                 if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
243                     (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
244                         fail_instr(insts[i + 1],
245                                    "Reads physical reg too soon after write");
246                 }
247         }
248 
249         /* "After an SFU lookup instruction, accumulator r4 must not be read
250          *  in the following two instructions. Any other instruction that
251          *  results in r4 being written (that is, TMU read, TLB read, SFU
252          *  lookup) cannot occur in the two instructions following an SFU
253          *  lookup."
254          */
255         int last_sfu_inst = -10;
256         for (int i = 0; i < num_inst - 1; i++) {
257                 uint64_t inst = insts[i];
258                 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
259 
260                 if (i - last_sfu_inst <= 2 &&
261                     (writes_sfu(inst) ||
262                      sig == QPU_SIG_LOAD_TMU0 ||
263                      sig == QPU_SIG_LOAD_TMU1 ||
264                      sig == QPU_SIG_COLOR_LOAD)) {
265                         fail_instr(inst, "R4 write too soon after SFU write");
266                 }
267 
268                 if (writes_sfu(inst))
269                         last_sfu_inst = i;
270         }
271 
272         for (int i = 0; i < num_inst - 1; i++) {
273                 uint64_t inst = insts[i];
274 
275                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
276                     QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
277                     QPU_SMALL_IMM_MUL_ROT) {
278                         uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
279                         uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
280 
281                         /* "The full horizontal vector rotate is only
282                          *  available when both of the mul ALU input arguments
283                          *  are taken from accumulators r0-r3."
284                          */
285                         if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
286                                 fail_instr(inst,
287                                            "MUL rotate using non-accumulator "
288                                            "input");
289                         }
290 
291                         if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
292                             QPU_SMALL_IMM_MUL_ROT) {
293                                 /* "An instruction that does a vector rotate
294                                  *  by r5 must not immediately follow an
295                                  *  instruction that writes to r5."
296                                  */
297                                 if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
298                                         fail_instr(inst,
299                                                    "vector rotate by r5 "
300                                                    "immediately after r5 write");
301                                 }
302                         }
303 
304                         /* "An instruction that does a vector rotate must not
305                          *  immediately follow an instruction that writes to the
306                          *  accumulator that is being rotated."
307                          */
308                         if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
309                             writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
310                                 fail_instr(inst,
311                                            "vector rotate of value "
312                                            "written in previous instruction");
313                         }
314                 }
315         }
316 
317         /* "An instruction that does a vector rotate must not immediately
318          *  follow an instruction that writes to the accumulator that is being
319          *  rotated.
320          *
321          * XXX: TODO.
322          */
323 
324         /* "After an instruction that does a TLB Z write, the multisample mask
325          *  must not be read as an instruction input argument in the following
326          *  two instruction. The TLB Z write instruction can, however, be
327          *  followed immediately by a TLB color write."
328          */
329         for (int i = 0; i < num_inst - 1; i++) {
330                 uint64_t inst = insts[i];
331                 if (writes_reg(inst, QPU_W_TLB_Z) &&
332                     (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
333                      reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
334                         fail_instr(inst, "TLB Z write followed by MS mask read");
335                 }
336         }
337 
338         /*
339          * "A single instruction can only perform a maximum of one of the
340          *  following closely coupled peripheral accesses in a single
341          *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
342          *  combined color read and write, SFU write, Mutex read or Semaphore
343          *  access."
344          */
345         for (int i = 0; i < num_inst - 1; i++) {
346                 uint64_t inst = insts[i];
347 
348                 if (qpu_num_sf_accesses(inst) > 1)
349                         fail_instr(inst, "Single instruction writes SFU twice");
350         }
351 
352         /* "The uniform base pointer can be written (from SIMD element 0) by
353          *  the processor to reset the stream, there must be at least two
354          *  nonuniform-accessing instructions following a pointer change
355          *  before uniforms can be accessed once more."
356          */
357         int last_unif_pointer_update = -3;
358         for (int i = 0; i < num_inst; i++) {
359                 uint64_t inst = insts[i];
360                 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
361                 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
362 
363                 if (reads_reg(inst, QPU_R_UNIF) &&
364                     i - last_unif_pointer_update <= 2) {
365                         fail_instr(inst,
366                                    "uniform read too soon after pointer update");
367                 }
368 
369                 if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
370                     waddr_mul == QPU_W_UNIFORMS_ADDRESS)
371                         last_unif_pointer_update = i;
372         }
373 
374         if (threaded) {
375                 bool last_thrsw_found = false;
376                 bool scoreboard_locked = false;
377                 int tex_samples_outstanding = 0;
378                 int last_tex_samples_outstanding = 0;
379                 int thrsw_ip = -1;
380 
381                 for (int i = 0; i < num_inst; i++) {
382                         uint64_t inst = insts[i];
383                         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
384 
385                         if (i == thrsw_ip) {
386                                 /* In order to get texture results back in the
387                                  * correct order, before a new thrsw we have
388                                  * to read all the texture results from before
389                                  * the previous thrsw.
390                                  *
391                                  * FIXME: Is collecting the remaining results
392                                  * during the delay slots OK, or should we do
393                                  * this at THRSW signal time?
394                                  */
395                                 if (last_tex_samples_outstanding != 0) {
396                                         fail_instr(inst, "THRSW with texture "
397                                                    "results from the previous "
398                                                    "THRSW still in the FIFO.");
399                                 }
400 
401                                 last_tex_samples_outstanding =
402                                         tex_samples_outstanding;
403                                 tex_samples_outstanding = 0;
404                         }
405 
406                         if (qpu_inst_is_tlb(inst))
407                                 scoreboard_locked = true;
408 
409                         switch (sig) {
410                         case QPU_SIG_THREAD_SWITCH:
411                         case QPU_SIG_LAST_THREAD_SWITCH:
412                                 /* No thread switching with the scoreboard
413                                  * locked.  Doing so means we may deadlock
414                                  * when the other thread tries to lock
415                                  * scoreboard.
416                                  */
417                                 if (scoreboard_locked) {
418                                         fail_instr(inst, "THRSW with the "
419                                                    "scoreboard locked.");
420                                 }
421 
422                                 /* No thread switching after lthrsw, since
423                                  * lthrsw means that we get delayed until the
424                                  * other shader is ready for us to terminate.
425                                  */
426                                 if (last_thrsw_found) {
427                                         fail_instr(inst, "THRSW after a "
428                                                    "previous LTHRSW");
429                                 }
430 
431                                 if (sig == QPU_SIG_LAST_THREAD_SWITCH)
432                                         last_thrsw_found = true;
433 
434                                 /* No THRSW while we already have a THRSW
435                                  * queued.
436                                  */
437                                 if (i < thrsw_ip) {
438                                         fail_instr(inst,
439                                                    "THRSW with a THRSW queued.");
440                                 }
441 
442                                 thrsw_ip = i + 3;
443                                 break;
444 
445                         case QPU_SIG_LOAD_TMU0:
446                         case QPU_SIG_LOAD_TMU1:
447                                 if (last_tex_samples_outstanding == 0) {
448                                         fail_instr(inst, "TMU load with nothing "
449                                                    "in the results fifo from "
450                                                    "the previous THRSW.");
451                                 }
452 
453                                 last_tex_samples_outstanding--;
454                                 break;
455                         }
456 
457                         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
458                         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
459                         if (waddr_add == QPU_W_TMU0_S ||
460                             waddr_add == QPU_W_TMU1_S ||
461                             waddr_mul == QPU_W_TMU0_S ||
462                             waddr_mul == QPU_W_TMU1_S) {
463                                 tex_samples_outstanding++;
464                         }
465                 }
466         }
467 }
468