1 /*
2  * Copyright (C) 2020 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compiler.h"
25 #include "bi_print.h"
26 #include "bi_generated_pack.h"
27 
/* Bit-copies an arbitrary packed struct into a uint64_t and returns it from
 * the *enclosing* function (note: this macro contains a `return`). memcpy is
 * used instead of a pointer cast to avoid strict-aliasing violations; the
 * struct is assumed to fit in 64 bits, with unused high bits left zero. */
#define RETURN_PACKED(str) { \
        uint64_t temp = 0; \
        memcpy(&temp, &str, sizeof(str)); \
        return temp; \
}
33 
34 /* This file contains the final passes of the compiler. Running after
35  * scheduling and RA, the IR is now finalized, so we need to emit it to actual
36  * bits on the wire (as well as fixup branches) */
37 
38 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2,bool tdd)39 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2, bool tdd)
40 {
41         /* next_dependencies are the union of the dependencies of successors'
42          * dependencies */
43 
44         unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
45         dependency_wait |= next_2 ? next_2->dependencies : 0;
46 
47         struct bifrost_header header = {
48                 .flow_control =
49                         (next_1 == NULL) ? BIFROST_FLOW_END :
50                         clause->flow_control,
51                 .terminate_discarded_threads = tdd,
52                 .next_clause_prefetch = clause->next_clause_prefetch,
53                 .staging_barrier = clause->staging_barrier,
54                 .staging_register = clause->staging_register,
55                 .dependency_wait = dependency_wait,
56                 .dependency_slot = clause->scoreboard_id,
57                 .message_type = clause->message_type,
58                 .next_message_type = next_1 ? next_1->message_type : 0,
59                 .suppress_inf = true,
60                 .suppress_nan = true,
61         };
62 
63         uint64_t u = 0;
64         memcpy(&u, &header, sizeof(header));
65         return u;
66 }
67 
68 /* The uniform/constant slot allows loading a contiguous 64-bit immediate or
69  * pushed uniform per bundle. Figure out which one we need in the bundle (the
70  * scheduler needs to ensure we only have one type per bundle), validate
71  * everything, and rewrite away the register/uniform indices to use 3-bit
72  * sources directly. */
73 
74 static unsigned
bi_lookup_constant(bi_clause * clause,uint32_t cons,bool * hi)75 bi_lookup_constant(bi_clause *clause, uint32_t cons, bool *hi)
76 {
77         for (unsigned i = 0; i < clause->constant_count; ++i) {
78                 /* Try to apply to top or to bottom */
79                 uint64_t top = clause->constants[i];
80 
81                 if (cons == ((uint32_t) top | (cons & 0xF)))
82                         return i;
83 
84                 if (cons == (top >> 32ul)) {
85                         *hi = true;
86                         return i;
87                 }
88         }
89 
90         unreachable("Invalid constant accessed");
91 }
92 
/* Maps a constant slot index (0..5) to its FAU page selector, pre-shifted
 * into position (bits 4..7). Slots 0-3 encode as pages 4-7 and slots 4-5
 * wrap around to pages 2-3. */
static unsigned
bi_constant_field(unsigned idx)
{
        assert(idx <= 5);

        unsigned page = (idx < 4) ? (idx + 4) : (idx - 2);
        return page << 4;
}
104 
105 static bool
bi_assign_fau_idx_single(bi_registers * regs,bi_clause * clause,bi_instruction * ins,bool assigned,bool fast_zero)106 bi_assign_fau_idx_single(bi_registers *regs,
107                          bi_clause *clause,
108                          bi_instruction *ins,
109                          bool assigned,
110                          bool fast_zero)
111 {
112         if (!ins)
113                 return assigned;
114 
115         if (ins->type == BI_BRANCH && clause->branch_constant) {
116                 /* By convention branch constant is last */
117                 unsigned idx = clause->constant_count - 1;
118 
119                 /* We can only jump to clauses which are qword aligned so the
120                  * bottom 4-bits of the offset are necessarily 0 */
121                 unsigned lo = 0;
122 
123                 /* Build the constant */
124                 unsigned C = bi_constant_field(idx) | lo;
125 
126                 if (assigned && regs->fau_idx != C)
127                         unreachable("Mismatched fau_idx: branch");
128 
129                 regs->fau_idx = C;
130                 return true;
131         }
132 
133         bi_foreach_src(ins, s) {
134                 if (s == 0 && (ins->type == BI_LOAD_VAR_ADDRESS || ins->type == BI_LOAD_ATTR)) continue;
135                 if (s == 1 && (ins->type == BI_BRANCH)) continue;
136 
137                 if (ins->src[s] & BIR_INDEX_CONSTANT) {
138                         /* Let direct addresses through */
139                         if (ins->type == BI_LOAD_VAR)
140                                 continue;
141 
142                         bool hi = false;
143                         uint32_t cons = bi_get_immediate(ins, s);
144                         unsigned idx = bi_lookup_constant(clause, cons, &hi);
145                         unsigned lo = clause->constants[idx] & 0xF;
146                         unsigned f = bi_constant_field(idx) | lo;
147 
148                         if (assigned && regs->fau_idx != f)
149                                 unreachable("Mismatched uniform/const field: imm");
150 
151                         regs->fau_idx = f;
152                         ins->src[s] = BIR_INDEX_PASS | (hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
153                         assigned = true;
154                 } else if (ins->src[s] & BIR_INDEX_ZERO && (ins->type == BI_LOAD_UNIFORM || ins->type == BI_LOAD_VAR)) {
155                         /* XXX: HACK UNTIL WE HAVE HI MATCHING DUE TO OVERFLOW XXX */
156                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_HI;
157                 } else if (ins->src[s] & BIR_INDEX_ZERO && !fast_zero) {
158                         /* FMAs have a fast zero slot, ADD needs to use the
159                          * uniform/const slot's special 0 mode handled here */
160                         unsigned f = 0;
161 
162                         if (assigned && regs->fau_idx != f)
163                                 unreachable("Mismatched uniform/const field: 0");
164 
165                         regs->fau_idx = f;
166                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_LO;
167                         assigned = true;
168                 } else if (ins->src[s] & BIR_INDEX_ZERO && fast_zero) {
169                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
170                 } else if (ins->src[s] & BIR_INDEX_BLEND) {
171                         unsigned rt = ins->blend_location;
172 
173                         assert(rt <= 7);
174                         assert((ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_HI ||
175                                (ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_LO);
176                         ins->src[s] = BIR_INDEX_PASS | (ins->src[s] & ~BIR_SPECIAL);
177                         if (assigned && regs->fau_idx != (8 | rt))
178                                 unreachable("Mismatched FAU index");
179 
180                         regs->fau_idx = 8 | rt;
181                         assigned = true;
182                 } else if (s & BIR_INDEX_UNIFORM) {
183                         unreachable("Push uniforms not implemented yet");
184                 }
185         }
186 
187         return assigned;
188 }
189 
190 static void
bi_assign_fau_idx(bi_clause * clause,bi_bundle * bundle)191 bi_assign_fau_idx(bi_clause *clause,
192                   bi_bundle *bundle)
193 {
194         bool assigned =
195                 bi_assign_fau_idx_single(&bundle->regs, clause, bundle->fma, false, true);
196 
197         bi_assign_fau_idx_single(&bundle->regs, clause, bundle->add, assigned, false);
198 }
199 
200 /* Assigns a slot for reading, before anything is written */
201 
202 static void
bi_assign_slot_read(bi_registers * regs,unsigned src)203 bi_assign_slot_read(bi_registers *regs, unsigned src)
204 {
205         /* We only assign for registers */
206         if (!(src & BIR_INDEX_REGISTER))
207                 return;
208 
209         unsigned reg = src & ~BIR_INDEX_REGISTER;
210 
211         /* Check if we already assigned the slot */
212         for (unsigned i = 0; i <= 1; ++i) {
213                 if (regs->slot[i] == reg && regs->enabled[i])
214                         return;
215         }
216 
217         if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
218                 return;
219 
220         /* Assign it now */
221 
222         for (unsigned i = 0; i <= 1; ++i) {
223                 if (!regs->enabled[i]) {
224                         regs->slot[i] = reg;
225                         regs->enabled[i] = true;
226                         return;
227                 }
228         }
229 
230         if (!regs->slot23.slot3) {
231                 regs->slot[2] = reg;
232                 regs->slot23.slot2 = BIFROST_OP_READ;
233                 return;
234         }
235 
236         bi_print_slots(regs, stderr);
237         unreachable("Failed to find a free slot for src");
238 }
239 
240 static bi_registers
bi_assign_slots(bi_bundle * now,bi_bundle * prev)241 bi_assign_slots(bi_bundle *now, bi_bundle *prev)
242 {
243         /* We assign slots for the main register mechanism. Special ops
244          * use the data registers, which has its own mechanism entirely
245          * and thus gets skipped over here. */
246 
247         unsigned read_dreg = now->add &&
248                 bi_class_props[now->add->type] & BI_DATA_REG_SRC;
249 
250         unsigned write_dreg = prev->add &&
251                 bi_class_props[prev->add->type] & BI_DATA_REG_DEST;
252 
253         /* First, assign reads */
254 
255         if (now->fma)
256                 bi_foreach_src(now->fma, src)
257                         bi_assign_slot_read(&now->regs, now->fma->src[src]);
258 
259         if (now->add) {
260                 bi_foreach_src(now->add, src) {
261                         if (!(src == 0 && read_dreg))
262                                 bi_assign_slot_read(&now->regs, now->add->src[src]);
263                 }
264         }
265 
266         /* Next, assign writes */
267 
268         if (prev->add && prev->add->dest & BIR_INDEX_REGISTER && !write_dreg) {
269                 now->regs.slot[3] = prev->add->dest & ~BIR_INDEX_REGISTER;
270                 now->regs.slot23.slot3 = BIFROST_OP_WRITE;
271         }
272 
273         if (prev->fma && prev->fma->dest & BIR_INDEX_REGISTER) {
274                 unsigned r = prev->fma->dest & ~BIR_INDEX_REGISTER;
275 
276                 if (now->regs.slot23.slot3) {
277                         /* Scheduler constraint: cannot read 3 and write 2 */
278                         assert(!now->regs.slot23.slot2);
279                         now->regs.slot[2] = r;
280                         now->regs.slot23.slot2 = BIFROST_OP_WRITE;
281                 } else {
282                         now->regs.slot[3] = r;
283                         now->regs.slot23.slot3 = BIFROST_OP_WRITE;
284                         now->regs.slot23.slot3_fma = true;
285                 }
286         }
287 
288         return now->regs;
289 }
290 
291 static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)292 bi_pack_register_mode(bi_registers r)
293 {
294         /* Handle idle special case for first instructions */
295         if (r.first_instruction && !(r.slot23.slot2 | r.slot23.slot3))
296                 return BIFROST_IDLE_1;
297 
298         /* Otherwise, use the LUT */
299         for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
300                 if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
301                         return i;
302         }
303 
304         bi_print_slots(&r, stderr);
305         unreachable("Invalid slot assignment");
306 }
307 
/* Packs the register-control word (struct bifrost_regs): slot assignments,
 * the control field squeezed from the 5-bit register mode, and the FAU
 * index. Takes regs by value because the 63-x trick below rewrites slot
 * numbers locally. */
static uint64_t
bi_pack_registers(bi_registers regs)
{
        enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
        struct bifrost_regs s = { 0 };
        uint64_t packed = 0;

        /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 for
         * first instruction and adds 16 when reg 2 == reg 3 */

        unsigned ctrl;
        bool r2_equals_r3 = false;

        if (regs.first_instruction) {
                /* Bit 3 implicitly must be clear for first instructions.
                 * The affected patterns all write both ADD/FMA, but that
                 * is forbidden for the first instruction, so this does
                 * not add additional encoding constraints */
                assert(!(mode & 0x8));

                /* Move bit 4 to bit 3, since bit 3 is clear */
                ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

                /* If we can let r2 equal r3, we have to or the hardware raises
                 * INSTR_INVALID_ENC (it's unclear why). */
                if (!(regs.slot23.slot2 && regs.slot23.slot3))
                        r2_equals_r3 = true;
        } else {
                /* We force r2=r3 or not for the upper bit */
                ctrl = (mode & 0xF);
                r2_equals_r3 = (mode & 0x10);
        }

        if (regs.enabled[1]) {
                /* Gotta save that bit!~ Required by the 63-x trick */
                assert(regs.slot[1] > regs.slot[0]);
                assert(regs.enabled[0]);

                /* Do the 63-x trick, see docs/disasm */
                if (regs.slot[0] > 31) {
                        regs.slot[0] = 63 - regs.slot[0];
                        regs.slot[1] = 63 - regs.slot[1];
                }

                /* After the trick, slot 0 fits in 5 bits, slot 1 in 6 */
                assert(regs.slot[0] <= 31);
                assert(regs.slot[1] <= 63);

                s.ctrl = ctrl;
                s.reg1 = regs.slot[1];
                s.reg0 = regs.slot[0];
        } else {
                /* slot 1 disabled, so set to zero and use slot 1 for ctrl */
                s.ctrl = 0;
                s.reg1 = ctrl << 2;

                if (regs.enabled[0]) {
                        /* Bit 0 upper bit of slot 0 */
                        s.reg1 |= (regs.slot[0] >> 5);

                        /* Rest of slot 0 in usual spot */
                        s.reg0 = (regs.slot[0] & 0b11111);
                } else {
                        /* Bit 1 set if slot 0 also disabled */
                        s.reg1 |= (1 << 1);
                }
        }

        /* Force r2 =/!= r3 as needed */
        if (r2_equals_r3) {
                assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3));

                /* Duplicate the active slot's register into the unused one */
                if (regs.slot23.slot2)
                        regs.slot[3] = regs.slot[2];
                else
                        regs.slot[2] = regs.slot[3];
        } else if (!regs.first_instruction) {
                /* Enforced by the encoding anyway */
                assert(regs.slot[2] != regs.slot[3]);
        }

        s.reg2 = regs.slot[2];
        s.reg3 = regs.slot[3];
        s.fau_idx = regs.fau_idx;

        /* Bit-copy the packed struct out; avoids aliasing issues */
        memcpy(&packed, &s, sizeof(s));
        return packed;
}
395 
396 static unsigned
bi_pack_fma_special(bi_clause * clause,bi_instruction * ins,bi_registers * regs)397 bi_pack_fma_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
398 {
399         switch (ins->op.special) {
400         case BI_SPECIAL_CUBEFACE1:
401                 return pan_pack_fma_cubeface1(clause, ins, regs);
402         default:
403                 unreachable("Unknown special op");
404         }
405 }
406 
/* Generates a bi_pack_fma_<name> dispatcher for the shifted-bitwise family:
 * each picks the i32, v2i16, or v4i8 encoding from the destination size. */
#define BI_PACK_SHIFT(name)                                                      \
static unsigned                                                                  \
bi_pack_fma_ ## name(bi_clause *clause, bi_instruction *ins, bi_registers *regs) \
{                                                                                \
        switch (nir_alu_type_get_type_size(ins->dest_type)) {                    \
        case 32:                                                                 \
                return pan_pack_fma_ ## name ## _i32(clause, ins, regs);         \
        case 16:                                                                 \
                return pan_pack_fma_ ## name ## _v2i16(clause, ins, regs);       \
        case 8:                                                                  \
                return pan_pack_fma_ ## name ## _v4i8(clause, ins, regs);        \
        default:                                                                 \
                unreachable("Invalid dest size");                                \
        }                                                                        \
}

/* One dispatcher per shifted-bitwise opcode */
BI_PACK_SHIFT(rshift_and)
BI_PACK_SHIFT(lshift_and)
BI_PACK_SHIFT(rshift_or)
BI_PACK_SHIFT(lshift_or)
BI_PACK_SHIFT(rshift_xor)
BI_PACK_SHIFT(lshift_xor)
BI_PACK_SHIFT(arshift)
430 
431 static unsigned
432 bi_pack_fma_bitwise(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
433 {
434         switch (ins->op.bitwise) {
435         case BI_BITWISE_AND:
436                 return ins->bitwise.rshift ?
437                        bi_pack_fma_rshift_and(clause, ins, regs) :
438                        bi_pack_fma_lshift_and(clause, ins, regs);
439         case BI_BITWISE_OR:
440                 return ins->bitwise.rshift ?
441                        bi_pack_fma_rshift_or(clause, ins, regs) :
442                        bi_pack_fma_lshift_or(clause, ins, regs);
443         case BI_BITWISE_XOR:
444                 return ins->bitwise.rshift ?
445                        bi_pack_fma_rshift_xor(clause, ins, regs) :
446                        bi_pack_fma_lshift_xor(clause, ins, regs);
447         case BI_BITWISE_ARSHIFT:
448                 assert(ins->bitwise.rshift);
449                 return bi_pack_fma_arshift(clause, ins, regs);
450         default:
451                 unreachable("Invalid bitwise op");
452         }
453 }
454 
/* Packs the FMA half of a bundle, dispatching on instruction class and on
 * destination/source types to the generated per-opcode packers. A bundle
 * with no FMA instruction packs a NOP. */
static unsigned
bi_pack_fma(bi_clause *clause, bi_bundle bundle, bi_registers *regs)
{
        if (!bundle.fma)
                return pan_pack_fma_nop_i32(clause, NULL, regs);

        /* Destination type flags; bool32 encodes like u32 */
        bool f16 = bundle.fma->dest_type == nir_type_float16;
        bool f32 = bundle.fma->dest_type == nir_type_float32;
        bool u32 = bundle.fma->dest_type == nir_type_uint32 ||
                bundle.fma->dest_type == nir_type_bool32;
        bool u16 = bundle.fma->dest_type == nir_type_uint16;
        bool s32 = bundle.fma->dest_type == nir_type_int32;
        bool s16 = bundle.fma->dest_type == nir_type_int16;

        /* Source-0 type flags, used by compares and converts */
        bool src0_f16 = bundle.fma->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.fma->src_types[0] == nir_type_float32;
        bool src0_u16 = bundle.fma->src_types[0] == nir_type_uint16;
        bool src0_s16 = bundle.fma->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.fma->src_types[0] == nir_type_int8;
        bool src0_u8 = bundle.fma->src_types[0] == nir_type_uint8;

        /* EQ/NE compare bit patterns regardless of signedness */
        enum bi_cond cond = bundle.fma->cond;
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.fma->type) {
        case BI_ADD:
                if (bundle.fma->dest_type == nir_type_float32)
                        return pan_pack_fma_fadd_f32(clause, bundle.fma, regs);
                else if (bundle.fma->dest_type == nir_type_float16)
                        return pan_pack_fma_fadd_v2f16(clause, bundle.fma, regs);

                unreachable("TODO");
        case BI_CMP:
                /* Only float compares run on the FMA pipe */
                assert (src0_f16 || src0_f32);

                if (src0_f32)
                        return pan_pack_fma_fcmp_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fcmp_v2f16(clause, bundle.fma, regs);
        case BI_BITWISE:
                return bi_pack_fma_bitwise(clause, bundle.fma, regs);
        case BI_CONVERT:
                if (src0_s8) {
                        assert(s32);
                        return pan_pack_fma_s8_to_s32(clause, bundle.fma, regs);
                } else if (src0_u8) {
                        assert(u32);
                        return pan_pack_fma_u8_to_u32(clause, bundle.fma, regs);
                } else if (src0_s16) {
                        assert(s32);
                        return pan_pack_fma_s16_to_s32(clause, bundle.fma, regs);
                } else if (src0_u16) {
                        assert(u32);
                        return pan_pack_fma_u16_to_u32(clause, bundle.fma, regs);
                } else if (src0_f16) {
                        assert(f32);
                        return pan_pack_fma_f16_to_f32(clause, bundle.fma, regs);
                } else if (src0_f32) {
                        assert(f16);
                        return pan_pack_fma_v2f32_to_v2f16(clause, bundle.fma, regs);
                }

                unreachable("Invalid FMA convert");
        case BI_CSEL:
                /* EQ/NE can use the generic integer forms; other conditions
                 * need the signed/unsigned-specific encodings */
                if (f32)
                        return pan_pack_fma_csel_f32(clause, bundle.fma, regs);
                else if (f16)
                        return pan_pack_fma_csel_v2f16(clause, bundle.fma, regs);
                else if ((u32 || s32) && typeless_cond)
                        return pan_pack_fma_csel_i32(clause, bundle.fma, regs);
                else if ((u16 || s16) && typeless_cond)
                        return pan_pack_fma_csel_v2i16(clause, bundle.fma, regs);
                else if (u32)
                        return pan_pack_fma_csel_u32(clause, bundle.fma, regs);
                else if (u16)
                        return pan_pack_fma_csel_v2u16(clause, bundle.fma, regs);
                else if (s32)
                        return pan_pack_fma_csel_s32(clause, bundle.fma, regs);
                else if (s16)
                        return pan_pack_fma_csel_v2s16(clause, bundle.fma, regs);
                else
                        unreachable("Invalid csel type");
        case BI_FMA:
                if (bundle.fma->dest_type == nir_type_float32) {
                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_f32(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_f32(clause, bundle.fma, regs);
                } else {
                        assert(bundle.fma->dest_type == nir_type_float16);

                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_v2f16(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_v2f16(clause, bundle.fma, regs);
                }
        case BI_FREXP:
                assert(src0_f32 || src0_f16);

                if (src0_f32)
                        return pan_pack_fma_frexpe_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_frexpe_v2f16(clause, bundle.fma, regs);
        case BI_IMATH:
                /* XXX: Only 32-bit, with carries/borrows forced */
                assert(s32 || u32);

                if (bundle.fma->op.imath == BI_IMATH_ADD)
                        return pan_pack_fma_iaddc_i32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_isubb_i32(clause, bundle.fma, regs);
        case BI_MOV:
                return pan_pack_fma_mov_i32(clause, bundle.fma, regs);
        case BI_SELECT:
                /* Lane selects pack as MKVEC, sized by the source type */
                if (nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 16) {
                        return pan_pack_fma_mkvec_v2i16(clause, bundle.fma, regs);
                } else {
                        assert(nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 8);
                        return pan_pack_fma_mkvec_v4i8(clause, bundle.fma, regs);
                }
        case BI_ROUND:
                assert(f16 || f32);

                if (f16)
                        return pan_pack_fma_fround_v2f16(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fround_f32(clause, bundle.fma, regs);
        case BI_REDUCE_FMA:
                assert(src0_f32 && f32);
                return pan_pack_fma_fadd_lscale_f32(clause, bundle.fma, regs);
        case BI_IMUL:
                return pan_pack_fma_imul_i32(clause, bundle.fma, regs);
        case BI_SPECIAL_FMA:
                return bi_pack_fma_special(clause, bundle.fma, regs);
        default:
                unreachable("Cannot encode class as FMA");
        }
}
593 
/* Packs a conditional branch. Only the compare-against-zero form is
 * supported here: the condition must be EQ with src1 forced to zero
 * (NOTE(review): presumably guaranteed upstream by the scheduler —
 * confirm against the callers that construct BI_BRANCH). */
static unsigned
bi_pack_add_branch_cond(bi_instruction *ins, bi_registers *regs)
{
        assert(ins->cond == BI_COND_EQ);
        assert(ins->src[1] == BIR_INDEX_ZERO);

        unsigned zero_ctrl = 0;
        unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);

        if (size == 16) {
                /* See BR_SIZE_ZERO swizzle disassembly */
                zero_ctrl = ins->swizzle[0][0] ? 1 : 2;
        } else {
                /* Only 16- and 32-bit comparands are encodable */
                assert(size == 32);
        }

        /* EQ swap to NE */
        bool slot_swapped = false;

        struct bifrost_branch pack = {
                .src0 = bi_get_src(ins, regs, 0),
                .src1 = (zero_ctrl << 1) | !slot_swapped,
                .cond = BR_COND_EQ,
                .size = BR_SIZE_ZERO,
                .op = BIFROST_ADD_OP_BRANCH
        };

        if (ins->branch_target) {
                /* We assigned the constant slot to fetch the branch offset so
                 * we can just passthrough here. We put in the HI slot to match
                 * the blob since that's where the magic flags end up
                 */
                assert(!ins->src[2]);
                pack.src2 = BIFROST_SRC_FAU_HI;
        } else {
                pack.src2 = bi_get_src(ins, regs, 2);
        }

        RETURN_PACKED(pack);
}
634 
635 static unsigned
bi_pack_add_branch_uncond(bi_instruction * ins,bi_registers * regs)636 bi_pack_add_branch_uncond(bi_instruction *ins, bi_registers *regs)
637 {
638         struct bifrost_branch pack = {
639                 /* It's unclear what these bits actually mean */
640                 .src0 = BIFROST_SRC_FAU_LO,
641                 .src1 = BIFROST_SRC_PASS_FMA,
642 
643                 /* All ones in fact */
644                 .cond = (BR_ALWAYS & 0x7),
645                 .size = (BR_ALWAYS >> 3),
646                 .op = BIFROST_ADD_OP_BRANCH
647         };
648 
649         if (ins->branch_target) {
650                 /* Offset is passed as a PC-relative offset through an
651                  * embedded constant.
652                  */
653                 assert(!ins->src[2]);
654                 pack.src2 = BIFROST_SRC_FAU_HI;
655         } else {
656                 pack.src2 = bi_get_src(ins, regs, 2);
657         }
658 
659         RETURN_PACKED(pack);
660 }
661 
662 static unsigned
bi_pack_add_branch(bi_instruction * ins,bi_registers * regs)663 bi_pack_add_branch(bi_instruction *ins, bi_registers *regs)
664 {
665         if (ins->cond == BI_COND_ALWAYS)
666                 return bi_pack_add_branch_uncond(ins, regs);
667         else
668                 return bi_pack_add_branch_cond(ins, regs);
669 }
670 
671 static unsigned
bi_pack_add_special(bi_clause * clause,bi_instruction * ins,bi_registers * regs)672 bi_pack_add_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
673 {
674         bool f16 = ins->dest_type == nir_type_float16;
675 
676         switch (ins->op.special) {
677         case BI_SPECIAL_FRCP:
678                 return f16 ? pan_pack_add_frcp_f16(clause, ins, regs) :
679                              pan_pack_add_frcp_f32(clause, ins, regs);
680         case BI_SPECIAL_FRSQ:
681                 return f16 ? pan_pack_add_frsq_f16(clause, ins, regs) :
682                              pan_pack_add_frsq_f32(clause, ins, regs);
683         case BI_SPECIAL_EXP2_LOW:
684                 assert(!f16);
685                 return pan_pack_add_fexp_f32(clause, ins, regs);
686         case BI_SPECIAL_IABS:
687                 assert(ins->src_types[0] == nir_type_int32);
688                 return pan_pack_add_iabs_s32(clause, ins, regs);
689         case BI_SPECIAL_CUBEFACE2:
690                 return pan_pack_add_cubeface2(clause, ins, regs);
691         case BI_SPECIAL_CUBE_SSEL:
692                 return pan_pack_add_cube_ssel(clause, ins, regs);
693         case BI_SPECIAL_CUBE_TSEL:
694                 return pan_pack_add_cube_tsel(clause, ins, regs);
695         default:
696                 unreachable("Unknown special op");
697         }
698 }
699 
/* Packs the ADD slot of a bundle, dispatching on the IR instruction class
 * and on destination/source types to the matching auto-generated
 * pan_pack_add_* encoder. An empty ADD slot packs as a NOP. Returns the
 * raw packed ADD bits for this bundle. */

static unsigned
bi_pack_add(bi_clause *clause, bi_bundle bundle, bi_registers *regs, gl_shader_stage stage)
{
        if (!bundle.add)
                return pan_pack_add_nop_i32(clause, NULL, regs);

        /* Destination type flags; bool32 shares the u32 encodings */
        bool f16 = bundle.add->dest_type == nir_type_float16;
        bool f32 = bundle.add->dest_type == nir_type_float32;
        bool u32 = bundle.add->dest_type == nir_type_uint32 ||
                bundle.add->dest_type == nir_type_bool32;
        bool u16 = bundle.add->dest_type == nir_type_uint16;
        bool s32 = bundle.add->dest_type == nir_type_int32;
        bool s16 = bundle.add->dest_type == nir_type_int16;

        /* First-source type flags, used for compares and conversions */
        bool src0_f16 = bundle.add->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.add->src_types[0] == nir_type_float32;
        bool src0_u32 = bundle.add->src_types[0] == nir_type_uint32;
        bool src0_u16 = bundle.add->src_types[0] == nir_type_uint16;
        bool src0_u8 = bundle.add->src_types[0] == nir_type_uint8;
        bool src0_s32 = bundle.add->src_types[0] == nir_type_int32;
        bool src0_s16 = bundle.add->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.add->src_types[0] == nir_type_int8;

        unsigned sz = nir_alu_type_get_type_size(bundle.add->dest_type);
        enum bi_cond cond = bundle.add->cond;

        /* EQ/NE do not depend on signedness, so the sign-agnostic iN
         * compare encodings can serve both signed and unsigned sources */
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.add->type) {
        case BI_ADD:
                /* Only float adds are handled on ADD so far */
                if (bundle.add->dest_type == nir_type_float32)
                        return pan_pack_add_fadd_f32(clause, bundle.add, regs);
                else if (bundle.add->dest_type == nir_type_float16)
                        return pan_pack_add_fadd_v2f16(clause, bundle.add, regs);

                unreachable("TODO");
        case BI_ATEST:
                return pan_pack_add_atest(clause, bundle.add, regs);
        case BI_BRANCH:
                return bi_pack_add_branch(bundle.add, regs);
        case BI_CMP:
                /* Float first, then typeless (EQ/NE) integer encodings,
                 * then explicitly signed/unsigned integer encodings */
                if (src0_f32)
                        return pan_pack_add_fcmp_f32(clause, bundle.add, regs);
                else if (src0_f16)
                        return pan_pack_add_fcmp_v2f16(clause, bundle.add, regs);
                else if ((src0_u32 || src0_s32) && typeless_cond)
                        return pan_pack_add_icmp_i32(clause, bundle.add, regs);
                else if ((src0_u16 || src0_s16) && typeless_cond)
                        return pan_pack_add_icmp_v2i16(clause, bundle.add, regs);
                else if ((src0_u8 || src0_s8) && typeless_cond)
                        return pan_pack_add_icmp_v4i8(clause, bundle.add, regs);
                else if (src0_u32)
                        return pan_pack_add_icmp_u32(clause, bundle.add, regs);
                else if (src0_u16)
                        return pan_pack_add_icmp_v2u16(clause, bundle.add, regs);
                else if (src0_u8)
                        return pan_pack_add_icmp_v4u8(clause, bundle.add, regs);
                else if (src0_s32)
                        return pan_pack_add_icmp_s32(clause, bundle.add, regs);
                else if (src0_s16)
                        return pan_pack_add_icmp_v2s16(clause, bundle.add, regs);
                else if (src0_s8)
                        return pan_pack_add_icmp_v4s8(clause, bundle.add, regs);
                else
                        unreachable("Invalid cmp type");
        case BI_BLEND:
                return pan_pack_add_blend(clause, bundle.add, regs);
        case BI_BITWISE:
                unreachable("Packing todo");
        case BI_CONVERT:
                /* Exhaustive table of the supported src -> dest conversion
                 * encodings, keyed on (src0 type, dest type) */
                if (src0_f16 && s16)
                        return pan_pack_add_v2f16_to_v2s16(clause, bundle.add, regs);
                else if (src0_f16 && u16)
                        return pan_pack_add_v2f16_to_v2u16(clause, bundle.add, regs);
                else if (src0_f16 && s32)
                        return pan_pack_add_f16_to_s32(clause, bundle.add, regs);
                else if (src0_f16 && u32)
                        return pan_pack_add_f16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f16)
                        return pan_pack_add_v2s16_to_v2f16(clause, bundle.add, regs);
                else if (src0_u16 && f16)
                        return pan_pack_add_v2u16_to_v2f16(clause, bundle.add, regs);
                else if (src0_s8  && s16)
                        return pan_pack_add_v2s8_to_v2s16(clause, bundle.add, regs);
                else if (src0_u8  && u16)
                        return pan_pack_add_v2u8_to_v2u16(clause, bundle.add, regs);
                else if (src0_s8  && f16)
                        return pan_pack_add_v2s8_to_v2f16(clause, bundle.add, regs);
                else if (src0_u8  && f16)
                        return pan_pack_add_v2u8_to_v2f16(clause, bundle.add, regs);
                else if (src0_f32 && s32)
                        return pan_pack_add_f32_to_s32(clause, bundle.add, regs);
                else if (src0_f32 && u32)
                        return pan_pack_add_f32_to_u32(clause, bundle.add, regs);
                else if (src0_s8  && s32)
                        return pan_pack_add_s8_to_s32(clause, bundle.add, regs);
                else if (src0_u8  && u32)
                        return pan_pack_add_u8_to_u32(clause, bundle.add, regs);
                else if (src0_s8  && f32)
                        return pan_pack_add_s8_to_f32(clause, bundle.add, regs);
                else if (src0_u8  && f32)
                        return pan_pack_add_u8_to_f32(clause, bundle.add, regs);
                else if (src0_s32 && f32)
                        return pan_pack_add_s32_to_f32(clause, bundle.add, regs);
                else if (src0_u32 && f32)
                        return pan_pack_add_u32_to_f32(clause, bundle.add, regs);
                else if (src0_s16 && s32)
                        return pan_pack_add_s16_to_s32(clause, bundle.add, regs);
                else if (src0_u16 && u32)
                        return pan_pack_add_u16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f32)
                        return pan_pack_add_s16_to_f32(clause, bundle.add, regs);
                else if (src0_u16 && f32)
                        return pan_pack_add_u16_to_f32(clause, bundle.add, regs);
                else if (src0_f16 && f32)
                        return pan_pack_add_f16_to_f32(clause, bundle.add, regs);
                else if (src0_f32 && f16)
                        return pan_pack_add_v2f32_to_v2f16(clause, bundle.add, regs);
                else
                        unreachable("Invalid ADD convert");
        case BI_DISCARD:
                return pan_pack_add_discard_f32(clause, bundle.add, regs);
        case BI_FREXP:
                unreachable("Packing todo");
        case BI_IMATH:
                assert(sz == 8 || sz == 16 || sz == 32);

                /* 8/16-bit use the vector encodings; the signed encodings
                 * are used regardless of signedness (wrapping add/sub) */
                if (bundle.add->op.imath == BI_IMATH_ADD) {
                        return (sz == 8) ? pan_pack_add_iadd_v4s8(clause, bundle.add, regs) :
                                (sz == 16) ? pan_pack_add_iadd_v2s16(clause, bundle.add, regs) :
                                pan_pack_add_iadd_s32(clause, bundle.add, regs);
                } else {
                        return (sz == 8) ? pan_pack_add_isub_v4s8(clause, bundle.add, regs) :
                                (sz == 16) ? pan_pack_add_isub_v2s16(clause, bundle.add, regs) :
                                pan_pack_add_isub_s32(clause, bundle.add, regs);
                }
        case BI_LOAD_ATTR:
                return pan_pack_add_ld_attr_imm(clause, bundle.add, regs);
        case BI_LOAD:
        case BI_LOAD_UNIFORM:
                /* Loads are typeless on the wire; encoding only depends on
                 * the number of 32-bit channels */
                assert(u32 || s32 || f32);
                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_load_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_load_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_load_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_load_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_LOAD_VAR:
                if (bundle.add->src[0] & BIR_INDEX_CONSTANT) {
                        /* NOTE(review): immediate indices >= 20 select the
                         * "special" varying encoding — confirm the threshold
                         * against the ISA definitions */
                        if (bi_get_immediate(bundle.add, 0) >= 20)
                                return pan_pack_add_ld_var_special(clause, bundle.add, regs);
                        else if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat_imm(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var_imm(clause, bundle.add, regs);
                } else {
                        if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var(clause, bundle.add, regs);
                }
        case BI_LOAD_VAR_ADDRESS:
                return pan_pack_add_lea_attr_imm(clause, bundle.add, regs);
        case BI_LOAD_TILE:
                return pan_pack_add_ld_tile(clause, bundle.add, regs);
        case BI_MINMAX:
                /* Only float min/max handled so far */
                if (bundle.add->op.minmax == BI_MINMAX_MIN) {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmin_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmin_v2f16(clause, bundle.add, regs);
                        unreachable("TODO");
                } else {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmax_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmax_v2f16(clause, bundle.add, regs);
                        unreachable("TODO");
                }
        case BI_MOV:
                unreachable("Packing todo");
        case BI_STORE:
                /* Like loads, stores only care about channel count */
                assert(src0_u32 || src0_s32 || src0_f32);
                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_store_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_store_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_store_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_store_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_STORE_VAR:
                return pan_pack_add_st_cvt(clause, bundle.add, regs);
        case BI_SPECIAL_ADD:
                return bi_pack_add_special(clause, bundle.add, regs);
        case BI_TABLE:
                /* Only the f32 log table op is handled here */
                assert(bundle.add->dest_type == nir_type_float32);
                return pan_pack_add_flogd_f32(clause, bundle.add, regs);
        case BI_SELECT:
                /* Only the 16-bit case is supported, packed as MKVEC.v2i16 */
                assert(nir_alu_type_get_type_size(bundle.add->src_types[0]) == 16);
                return pan_pack_add_mkvec_v2i16(clause, bundle.add, regs);
        case BI_TEXC:
                return pan_pack_add_texc(clause, bundle.add, regs);
        case BI_TEXC_DUAL:
                unreachable("Packing todo");
        case BI_TEXS:
                /* 2D texturing with f16 or f32 results only */
                assert(f16 || f32);

                if (f16)
                        return pan_pack_add_texs_2d_f16(clause, bundle.add, regs);
                else
                        return pan_pack_add_texs_2d_f32(clause, bundle.add, regs);
        case BI_ROUND:
                unreachable("Packing todo");
        case BI_ZS_EMIT:
                return pan_pack_add_zs_emit(clause, bundle.add, regs);
        default:
                unreachable("Cannot encode class as ADD");
        }
}
919 
/* One FMA/ADD instruction pair packed to wire format, split across two
 * 64-bit words. Per bi_pack_bundle: `lo` holds the register block, the
 * FMA bits (from bit 35) and the low 6 bits of ADD (from bit 58); `hi`
 * holds the remaining ADD bits. */
struct bi_packed_bundle {
        uint64_t lo;
        uint64_t hi;
};
924 
925 /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
926  * this up at pack time. (Scheduling doesn't care.) */
927 
928 static void
bi_flip_slots(bi_registers * regs)929 bi_flip_slots(bi_registers *regs)
930 {
931         if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
932                 unsigned temp = regs->slot[0];
933                 regs->slot[0] = regs->slot[1];
934                 regs->slot[1] = temp;
935         }
936 
937 }
938 
939 /* Lower CUBEFACE2 to a CUBEFACE1/CUBEFACE2. This is a hack so the scheduler
940  * doesn't have to worry about this while we're just packing singletons */
941 
/* ctx: used to emit the companion FMA instruction into the IR.
 * bundle: singleton bundle whose ADD slot may hold a +CUBEFACE2. */
static void
bi_lower_cubeface2(bi_context *ctx, bi_bundle *bundle)
{
        /* Filter for +CUBEFACE2 */
        if (!bundle->add || bundle->add->type != BI_SPECIAL_ADD
                         || bundle->add->op.special != BI_SPECIAL_CUBEFACE2) {
                return;
        }

        /* This won't be used once we emit non-singletons, for now this is just
         * a fact of our scheduler and allows us to clobber FMA */
        assert(!bundle->fma);

        /* Construct an FMA op (*CUBEFACE1) feeding the ADD slot */
        bi_instruction cubeface1 = {
                .type = BI_SPECIAL_FMA,
                .op.special = BI_SPECIAL_CUBEFACE1,
                /* no dest, just to a temporary */
                .dest_type = nir_type_float32,
                .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
        };

        /* Copy over the register allocated sources (coordinates). */
        memcpy(&cubeface1.src, bundle->add->src, sizeof(cubeface1.src));

        /* Zeroed by RA since this is all 32-bit */
        for (unsigned i = 0; i < 3; ++i)
                assert(bundle->add->swizzle[i][0] == 0);

        /* Emit the instruction so it lands in the FMA slot of this bundle */
        bundle->fma = bi_emit_before(ctx, bundle->add, cubeface1);

        /* Now replace the sources of the CUBEFACE2 with a single passthrough
         * from the CUBEFACE1 (and a side-channel) */
        bundle->add->src[0] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
        bundle->add->src[1] = bundle->add->src[2] = 0;
}
979 
980 static struct bi_packed_bundle
bi_pack_bundle(bi_clause * clause,bi_bundle bundle,bi_bundle prev,bool first_bundle,gl_shader_stage stage)981 bi_pack_bundle(bi_clause *clause, bi_bundle bundle, bi_bundle prev, bool first_bundle, gl_shader_stage stage)
982 {
983         bi_assign_slots(&bundle, &prev);
984         bi_assign_fau_idx(clause, &bundle);
985         bundle.regs.first_instruction = first_bundle;
986 
987         bi_flip_slots(&bundle.regs);
988 
989         uint64_t reg = bi_pack_registers(bundle.regs);
990         uint64_t fma = bi_pack_fma(clause, bundle, &bundle.regs);
991         uint64_t add = bi_pack_add(clause, bundle, &bundle.regs, stage);
992 
993         struct bi_packed_bundle packed = {
994                 .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
995                 .hi = add >> 6
996         };
997 
998         return packed;
999 }
1000 
1001 /* Packs the next two constants as a dedicated constant quadword at the end of
1002  * the clause, returning the number packed. There are two cases to consider:
1003  *
1004  * Case #1: Branching is not used. For a single constant copy the upper nibble
1005  * over, easy.
1006  *
1007  * Case #2: Branching is used. For a single constant, it suffices to set the
1008  * upper nibble to 4 and leave the latter constant 0, which matches what the
1009  * blob does.
1010  *
1011  * Extending to multiple constants is considerably more tricky and left for
1012  * future work.
1013  */
1014 
static unsigned
bi_pack_constants(bi_context *ctx, bi_clause *clause,
                unsigned index,
                struct util_dynarray *emission)
{
        /* After these two, are we done? Determines tag */
        bool done = clause->constant_count <= (index + 2);
        ASSERTED bool only = clause->constant_count <= (index + 1);

        /* Is the constant we're packing for a branch? */
        bool branches = clause->branch_constant && done;

        /* TODO: Pos -- currently only a single constant quadword in a
         * single-bundle clause is supported */
        assert(index == 0 && clause->bundle_count == 1);
        assert(only);

        /* Compute branch offset instead of a dummy 0 */
        if (branches) {
                bi_instruction *br = clause->bundles[clause->bundle_count - 1].add;
                assert(br && br->type == BI_BRANCH && br->branch_target);

                /* Put it in the high place; clause offsets are measured in
                 * 16-byte quadwords, the constant stores a byte offset */
                int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
                int32_t bytes = qwords * 16;

                /* Copy so we get proper sign behaviour */
                uint32_t raw = 0;
                memcpy(&raw, &bytes, sizeof(raw));

                /* Clear off top bits for the magic bits */
                raw &= ~0xF0000000;

                /* Put in top 32-bits */
                clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
        }

        /* Top nibble of the first constant, used for the imm_2 workaround */
        uint64_t hi = clause->constants[index + 0] >> 60ull;

        struct bifrost_fmt_constant quad = {
                .pos = 0, /* TODO */
                .tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
                .imm_1 = clause->constants[index + 0] >> 4,
                /* Mirror a small top nibble into the second (unused)
                 * constant -- see the XXX errata note below */
                .imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
        };

        if (branches) {
                /* Branch offsets are less than 60-bits so this should work at
                 * least for now */
                quad.imm_1 |= (4ull << 60ull) >> 4;
                assert (hi == 0);
        }

        /* XXX: On G71, Connor observed that the difference of the top 4 bits
         * of the second constant with the first must be less than 8, otherwise
         * we have to swap them. On G52, I'm able to reproduce a similar issue
         * but with a different workaround (modeled above with a single
         * constant, unclear how to workaround for multiple constants.) Further
         * investigation needed. Possibly an errata. XXX */

        util_dynarray_append(emission, struct bifrost_fmt_constant, quad);

        return 2;
}
1078 
/* Packs a (currently single-bundle) clause: the header/instruction
 * quadword followed by any constant quadwords. next_1/next_2 are the
 * possible successor clauses fed to the header packing; tdd is the
 * terminate-discarded-threads flag. */
static void
bi_pack_clause(bi_context *ctx, bi_clause *clause,
                bi_clause *next_1, bi_clause *next_2,
                struct util_dynarray *emission, gl_shader_stage stage,
                bool tdd)
{
        /* After the deadline lowering */
        bi_lower_cubeface2(ctx, &clause->bundles[0]);

        struct bi_packed_bundle ins_1 = bi_pack_bundle(clause, clause->bundles[0], clause->bundles[0], true, stage);
        assert(clause->bundle_count == 1);

        /* State for packing constants throughout */
        unsigned constant_index = 0;

        struct bifrost_fmt1 quad_1 = {
                .tag = clause->constant_count ? BIFROST_FMT1_CONSTANTS : BIFROST_FMT1_FINAL,
                .header = bi_pack_header(clause, next_1, next_2, tdd),
                .ins_1 = ins_1.lo,
                /* The high word of the packed bundle is split across two
                 * format fields: the low 11 bits, then the next 3 bits */
                .ins_2 = ins_1.hi & ((1 << 11) - 1),
                .ins_0 = (ins_1.hi >> 11) & 0b111,
        };

        util_dynarray_append(emission, struct bifrost_fmt1, quad_1);

        /* Pack the remaining constants */

        while (constant_index < clause->constant_count) {
                constant_index += bi_pack_constants(ctx, clause,
                                constant_index, emission);
        }
}
1111 
/* Returns the clause following `clause` in program order, crossing block
 * boundaries (and skipping empty blocks), or NULL at the end of the
 * shader. Pass clause = NULL to get the first clause of `block`. */
static bi_clause *
bi_next_clause(bi_context *ctx, pan_block *block, bi_clause *clause)
{
        /* Try the first clause in this block if we're starting from scratch */
        if (!clause && !list_is_empty(&((bi_block *) block)->clauses))
                return list_first_entry(&((bi_block *) block)->clauses, bi_clause, link);

        /* Try the next clause in this block */
        if (clause && clause->link.next != &((bi_block *) block)->clauses)
                return list_first_entry(&(clause->link), bi_clause, link);

        /* Try the next block, or the one after that if it's empty, etc. */
        pan_block *next_block = pan_next_block(block);

        bi_foreach_block_from(ctx, next_block, block) {
                bi_block *blk = (bi_block *) block;

                if (!list_is_empty(&blk->clauses))
                        return list_first_entry(&(blk->clauses), bi_clause, link);
        }

        return NULL;
}
1135 
1136 /* We should terminate discarded threads if there may be discarded threads (a
1137  * fragment shader) and helper invocations are not used. Further logic may be
1138  * required for future discard/demote differentiation
1139  */
1140 
1141 static bool
bi_terminate_discarded_threads(bi_context * ctx)1142 bi_terminate_discarded_threads(bi_context *ctx)
1143 {
1144         if (ctx->stage == MESA_SHADER_FRAGMENT)
1145                 return !ctx->nir->info.fs.needs_helper_invocations;
1146         else
1147                 return false;
1148 }
1149 
/* Records the current byte offset into `emission` for the render target a
 * BLEND clause writes -- presumably so the blend shader return address can
 * be patched later (NOTE(review): confirm the consumer of
 * blend_ret_offsets). Currently hard-disabled via assert(0) since
 * non-terminal blend clauses are unsupported. */
static void
bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
                          const bi_clause *clause)
{
        /* No need to collect return addresses when we're in a blend shader. */
        if (ctx->is_blend)
                return;

        const bi_bundle *bundle = &clause->bundles[clause->bundle_count - 1];
        const bi_instruction *ins = bundle->add;

        if (!ins || ins->type != BI_BLEND)
                return;

        /* We don't support non-terminal blend instructions yet.
         * That would require fixing blend shaders to restore the registers
         * they use before jumping back to the fragment shader, which is
         * currently not supported.
         */
        assert(0);

        assert(ins->blend_location < ARRAY_SIZE(ctx->blend_ret_offsets));
        assert(!ctx->blend_ret_offsets[ins->blend_location]);
        ctx->blend_ret_offsets[ins->blend_location] =
                util_dynarray_num_elements(emission, uint8_t);
        /* Return addresses must be 8-byte aligned */
        assert(!(ctx->blend_ret_offsets[ins->blend_location] & 0x7));
}
1177 
/* Top-level pack entry point: walks every clause of every block in
 * program order, emitting packed clauses into `emission`. */
void
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
        bool tdd = bi_terminate_discarded_threads(ctx);

        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                /* Passthrough the first clause of where we're branching to for
                 * the last clause of the block (the clause with the branch).
                 * NOTE(review): gates on successors[1] but reads
                 * successors[0] -- looks intentional (only two-successor
                 * blocks need it), but worth confirming */

                bi_clause *succ_clause = block->base.successors[1] ?
                        bi_next_clause(ctx, block->base.successors[0], NULL) : NULL;

                bi_foreach_clause_in_block(block, clause) {
                        bool is_last = clause->link.next == &block->clauses;

                        /* next = next clause in program order; next_2 only
                         * applies to the block's final (branching) clause */
                        bi_clause *next = bi_next_clause(ctx, _block, clause);
                        bi_clause *next_2 = is_last ? succ_clause : NULL;

                        bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage, tdd);

                        if (!is_last)
                                bi_collect_blend_ret_addr(ctx, emission, clause);
                }
        }
}
1205