1 /*
2 * Copyright (C) 2020 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "compiler.h"
25 #include "bi_print.h"
26 #include "bi_generated_pack.h"
27
/* Copy a bit-packed struct into a uint64_t and return it from the enclosing
 * function. memcpy (rather than a pointer cast) avoids strict-aliasing
 * undefined behavior; assumes sizeof(str) <= sizeof(uint64_t). */
#define RETURN_PACKED(str) { \
        uint64_t temp = 0; \
        memcpy(&temp, &str, sizeof(str)); \
        return temp; \
}
33
34 /* This file contains the final passes of the compiler. Running after
35 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
36 * bits on the wire (as well as fixup branches) */
37
38 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2,bool tdd)39 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2, bool tdd)
40 {
41 /* next_dependencies are the union of the dependencies of successors'
42 * dependencies */
43
44 unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
45 dependency_wait |= next_2 ? next_2->dependencies : 0;
46
47 struct bifrost_header header = {
48 .flow_control =
49 (next_1 == NULL) ? BIFROST_FLOW_END :
50 clause->flow_control,
51 .terminate_discarded_threads = tdd,
52 .next_clause_prefetch = clause->next_clause_prefetch,
53 .staging_barrier = clause->staging_barrier,
54 .staging_register = clause->staging_register,
55 .dependency_wait = dependency_wait,
56 .dependency_slot = clause->scoreboard_id,
57 .message_type = clause->message_type,
58 .next_message_type = next_1 ? next_1->message_type : 0,
59 .suppress_inf = true,
60 .suppress_nan = true,
61 };
62
63 uint64_t u = 0;
64 memcpy(&u, &header, sizeof(header));
65 return u;
66 }
67
68 /* The uniform/constant slot allows loading a contiguous 64-bit immediate or
69 * pushed uniform per bundle. Figure out which one we need in the bundle (the
70 * scheduler needs to ensure we only have one type per bundle), validate
71 * everything, and rewrite away the register/uniform indices to use 3-bit
72 * sources directly. */
73
74 static unsigned
bi_lookup_constant(bi_clause * clause,uint32_t cons,bool * hi)75 bi_lookup_constant(bi_clause *clause, uint32_t cons, bool *hi)
76 {
77 for (unsigned i = 0; i < clause->constant_count; ++i) {
78 /* Try to apply to top or to bottom */
79 uint64_t top = clause->constants[i];
80
81 if (cons == ((uint32_t) top | (cons & 0xF)))
82 return i;
83
84 if (cons == (top >> 32ul)) {
85 *hi = true;
86 return i;
87 }
88 }
89
90 unreachable("Invalid constant accessed");
91 }
92
/* Maps a constant-table index (0..5) to its FAU-index field encoding,
 * shifted into the high nibble. */
static unsigned
bi_constant_field(unsigned idx)
{
        static const unsigned encodings[] = {
                4, 5, 6, 7, 2, 3
        };

        assert(idx <= 5);
        return encodings[idx] << 4;
}
104
105 static bool
bi_assign_fau_idx_single(bi_registers * regs,bi_clause * clause,bi_instruction * ins,bool assigned,bool fast_zero)106 bi_assign_fau_idx_single(bi_registers *regs,
107 bi_clause *clause,
108 bi_instruction *ins,
109 bool assigned,
110 bool fast_zero)
111 {
112 if (!ins)
113 return assigned;
114
115 if (ins->type == BI_BRANCH && clause->branch_constant) {
116 /* By convention branch constant is last */
117 unsigned idx = clause->constant_count - 1;
118
119 /* We can only jump to clauses which are qword aligned so the
120 * bottom 4-bits of the offset are necessarily 0 */
121 unsigned lo = 0;
122
123 /* Build the constant */
124 unsigned C = bi_constant_field(idx) | lo;
125
126 if (assigned && regs->fau_idx != C)
127 unreachable("Mismatched fau_idx: branch");
128
129 regs->fau_idx = C;
130 return true;
131 }
132
133 bi_foreach_src(ins, s) {
134 if (s == 0 && (ins->type == BI_LOAD_VAR_ADDRESS || ins->type == BI_LOAD_ATTR)) continue;
135 if (s == 1 && (ins->type == BI_BRANCH)) continue;
136
137 if (ins->src[s] & BIR_INDEX_CONSTANT) {
138 /* Let direct addresses through */
139 if (ins->type == BI_LOAD_VAR)
140 continue;
141
142 bool hi = false;
143 uint32_t cons = bi_get_immediate(ins, s);
144 unsigned idx = bi_lookup_constant(clause, cons, &hi);
145 unsigned lo = clause->constants[idx] & 0xF;
146 unsigned f = bi_constant_field(idx) | lo;
147
148 if (assigned && regs->fau_idx != f)
149 unreachable("Mismatched uniform/const field: imm");
150
151 regs->fau_idx = f;
152 ins->src[s] = BIR_INDEX_PASS | (hi ? BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
153 assigned = true;
154 } else if (ins->src[s] & BIR_INDEX_ZERO && (ins->type == BI_LOAD_UNIFORM || ins->type == BI_LOAD_VAR)) {
155 /* XXX: HACK UNTIL WE HAVE HI MATCHING DUE TO OVERFLOW XXX */
156 ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_HI;
157 } else if (ins->src[s] & BIR_INDEX_ZERO && !fast_zero) {
158 /* FMAs have a fast zero slot, ADD needs to use the
159 * uniform/const slot's special 0 mode handled here */
160 unsigned f = 0;
161
162 if (assigned && regs->fau_idx != f)
163 unreachable("Mismatched uniform/const field: 0");
164
165 regs->fau_idx = f;
166 ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_FAU_LO;
167 assigned = true;
168 } else if (ins->src[s] & BIR_INDEX_ZERO && fast_zero) {
169 ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
170 } else if (ins->src[s] & BIR_INDEX_BLEND) {
171 unsigned rt = ins->blend_location;
172
173 assert(rt <= 7);
174 assert((ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_HI ||
175 (ins->src[s] & ~BIR_SPECIAL) == BIFROST_SRC_FAU_LO);
176 ins->src[s] = BIR_INDEX_PASS | (ins->src[s] & ~BIR_SPECIAL);
177 if (assigned && regs->fau_idx != (8 | rt))
178 unreachable("Mismatched FAU index");
179
180 regs->fau_idx = 8 | rt;
181 assigned = true;
182 } else if (s & BIR_INDEX_UNIFORM) {
183 unreachable("Push uniforms not implemented yet");
184 }
185 }
186
187 return assigned;
188 }
189
190 static void
bi_assign_fau_idx(bi_clause * clause,bi_bundle * bundle)191 bi_assign_fau_idx(bi_clause *clause,
192 bi_bundle *bundle)
193 {
194 bool assigned =
195 bi_assign_fau_idx_single(&bundle->regs, clause, bundle->fma, false, true);
196
197 bi_assign_fau_idx_single(&bundle->regs, clause, bundle->add, assigned, false);
198 }
199
200 /* Assigns a slot for reading, before anything is written */
201
202 static void
bi_assign_slot_read(bi_registers * regs,unsigned src)203 bi_assign_slot_read(bi_registers *regs, unsigned src)
204 {
205 /* We only assign for registers */
206 if (!(src & BIR_INDEX_REGISTER))
207 return;
208
209 unsigned reg = src & ~BIR_INDEX_REGISTER;
210
211 /* Check if we already assigned the slot */
212 for (unsigned i = 0; i <= 1; ++i) {
213 if (regs->slot[i] == reg && regs->enabled[i])
214 return;
215 }
216
217 if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
218 return;
219
220 /* Assign it now */
221
222 for (unsigned i = 0; i <= 1; ++i) {
223 if (!regs->enabled[i]) {
224 regs->slot[i] = reg;
225 regs->enabled[i] = true;
226 return;
227 }
228 }
229
230 if (!regs->slot23.slot3) {
231 regs->slot[2] = reg;
232 regs->slot23.slot2 = BIFROST_OP_READ;
233 return;
234 }
235
236 bi_print_slots(regs, stderr);
237 unreachable("Failed to find a free slot for src");
238 }
239
/* Assigns register slots for a bundle: reads for the current bundle `now`,
 * and writes for the previous bundle `prev` (results are written back one
 * bundle later on Bifrost). Returns the finished assignment by value. */
static bi_registers
bi_assign_slots(bi_bundle *now, bi_bundle *prev)
{
        /* We assign slots for the main register mechanism. Special ops
         * use the data registers, which has its own mechanism entirely
         * and thus gets skipped over here. */

        unsigned read_dreg = now->add &&
                bi_class_props[now->add->type] & BI_DATA_REG_SRC;

        unsigned write_dreg = prev->add &&
                bi_class_props[prev->add->type] & BI_DATA_REG_DEST;

        /* First, assign reads */

        if (now->fma)
                bi_foreach_src(now->fma, src)
                        bi_assign_slot_read(&now->regs, now->fma->src[src]);

        if (now->add) {
                bi_foreach_src(now->add, src) {
                        /* Source 0 of a data-register op goes through the
                         * staging mechanism, not a read slot */
                        if (!(src == 0 && read_dreg))
                                bi_assign_slot_read(&now->regs, now->add->src[src]);
                }
        }

        /* Next, assign writes */

        if (prev->add && prev->add->dest & BIR_INDEX_REGISTER && !write_dreg) {
                now->regs.slot[3] = prev->add->dest & ~BIR_INDEX_REGISTER;
                now->regs.slot23.slot3 = BIFROST_OP_WRITE;
        }

        if (prev->fma && prev->fma->dest & BIR_INDEX_REGISTER) {
                unsigned r = prev->fma->dest & ~BIR_INDEX_REGISTER;

                if (now->regs.slot23.slot3) {
                        /* ADD already took slot 3, so FMA's write falls back
                         * to slot 2 */
                        /* Scheduler constraint: cannot read 3 and write 2 */
                        assert(!now->regs.slot23.slot2);
                        now->regs.slot[2] = r;
                        now->regs.slot23.slot2 = BIFROST_OP_WRITE;
                } else {
                        now->regs.slot[3] = r;
                        now->regs.slot23.slot3 = BIFROST_OP_WRITE;
                        now->regs.slot23.slot3_fma = true;
                }
        }

        return now->regs;
}
290
/* Selects the 5-bit register mode for a slot assignment by searching the
 * control LUT for a matching slot2/slot3 configuration; the LUT index *is*
 * the mode. NOTE(review): the byte-wise memcmp assumes slot23 has no
 * uninitialized padding — confirm both LUT entries and r.slot23 are
 * zero-initialized. */
static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)
{
        /* Handle idle special case for first instructions */
        if (r.first_instruction && !(r.slot23.slot2 | r.slot23.slot3))
                return BIFROST_IDLE_1;

        /* Otherwise, use the LUT */
        for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
                if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
                        return i;
        }

        bi_print_slots(&r, stderr);
        unreachable("Invalid slot assignment");
}
307
/* Packs the register-control word of a bundle from the finished slot
 * assignment: encodes the 5-bit mode into the 4-bit ctrl field, the slot
 * registers, and the FAU index. Returns the packed bits. */
static uint64_t
bi_pack_registers(bi_registers regs)
{
        enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
        struct bifrost_regs s = { 0 };
        uint64_t packed = 0;

        /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 for
         * first instruction and adds 16 when reg 2 == reg 3 */

        unsigned ctrl;
        bool r2_equals_r3 = false;

        if (regs.first_instruction) {
                /* Bit 3 implicitly must be clear for first instructions.
                 * The affected patterns all write both ADD/FMA, but that
                 * is forbidden for the first instruction, so this does
                 * not add additional encoding constraints */
                assert(!(mode & 0x8));

                /* Move bit 4 to bit 3, since bit 3 is clear */
                ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

                /* If we can let r2 equal r3, we have to or the hardware raises
                 * INSTR_INVALID_ENC (it's unclear why). */
                if (!(regs.slot23.slot2 && regs.slot23.slot3))
                        r2_equals_r3 = true;
        } else {
                /* We force r2=r3 or not for the upper bit */
                ctrl = (mode & 0xF);
                r2_equals_r3 = (mode & 0x10);
        }

        if (regs.enabled[1]) {
                /* Gotta save that bit!~ Required by the 63-x trick */
                assert(regs.slot[1] > regs.slot[0]);
                assert(regs.enabled[0]);

                /* Do the 63-x trick, see docs/disasm */
                if (regs.slot[0] > 31) {
                        regs.slot[0] = 63 - regs.slot[0];
                        regs.slot[1] = 63 - regs.slot[1];
                }

                assert(regs.slot[0] <= 31);
                assert(regs.slot[1] <= 63);

                s.ctrl = ctrl;
                s.reg1 = regs.slot[1];
                s.reg0 = regs.slot[0];
        } else {
                /* slot 1 disabled, so set to zero and use slot 1 for ctrl */
                s.ctrl = 0;
                s.reg1 = ctrl << 2;

                if (regs.enabled[0]) {
                        /* Bit 0 upper bit of slot 0 */
                        s.reg1 |= (regs.slot[0] >> 5);

                        /* Rest of slot 0 in usual spot */
                        s.reg0 = (regs.slot[0] & 0b11111);
                } else {
                        /* Bit 1 set if slot 0 also disabled */
                        s.reg1 |= (1 << 1);
                }
        }

        /* Force r2 =/!= r3 as needed */
        if (r2_equals_r3) {
                assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3));

                /* Copy from the live slot so the decoder's r2==r3 rule holds */
                if (regs.slot23.slot2)
                        regs.slot[3] = regs.slot[2];
                else
                        regs.slot[2] = regs.slot[3];
        } else if (!regs.first_instruction) {
                /* Enforced by the encoding anyway */
                assert(regs.slot[2] != regs.slot[3]);
        }

        s.reg2 = regs.slot[2];
        s.reg3 = regs.slot[3];
        s.fau_idx = regs.fau_idx;

        memcpy(&packed, &s, sizeof(s));
        return packed;
}
395
396 static unsigned
bi_pack_fma_special(bi_clause * clause,bi_instruction * ins,bi_registers * regs)397 bi_pack_fma_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
398 {
399 switch (ins->op.special) {
400 case BI_SPECIAL_CUBEFACE1:
401 return pan_pack_fma_cubeface1(clause, ins, regs);
402 default:
403 unreachable("Unknown special op");
404 }
405 }
406
/* Shift-with-bitwise ops come in i32 / v2i16 / v4i8 variants selected by the
 * destination size; this macro generates one size dispatcher per opcode. */
#define BI_PACK_SHIFT(name) \
static unsigned \
bi_pack_fma_ ## name(bi_clause *clause, bi_instruction *ins, bi_registers *regs) \
{ \
        switch (nir_alu_type_get_type_size(ins->dest_type)) { \
        case 32: \
                return pan_pack_fma_ ## name ## _i32(clause, ins, regs); \
        case 16: \
                return pan_pack_fma_ ## name ## _v2i16(clause, ins, regs); \
        case 8: \
                return pan_pack_fma_ ## name ## _v4i8(clause, ins, regs); \
        default: \
                unreachable("Invalid dest size"); \
        } \
}

BI_PACK_SHIFT(rshift_and)
BI_PACK_SHIFT(lshift_and)
BI_PACK_SHIFT(rshift_or)
BI_PACK_SHIFT(lshift_or)
BI_PACK_SHIFT(rshift_xor)
BI_PACK_SHIFT(lshift_xor)
BI_PACK_SHIFT(arshift)
430
431 static unsigned
432 bi_pack_fma_bitwise(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
433 {
434 switch (ins->op.bitwise) {
435 case BI_BITWISE_AND:
436 return ins->bitwise.rshift ?
437 bi_pack_fma_rshift_and(clause, ins, regs) :
438 bi_pack_fma_lshift_and(clause, ins, regs);
439 case BI_BITWISE_OR:
440 return ins->bitwise.rshift ?
441 bi_pack_fma_rshift_or(clause, ins, regs) :
442 bi_pack_fma_lshift_or(clause, ins, regs);
443 case BI_BITWISE_XOR:
444 return ins->bitwise.rshift ?
445 bi_pack_fma_rshift_xor(clause, ins, regs) :
446 bi_pack_fma_lshift_xor(clause, ins, regs);
447 case BI_BITWISE_ARSHIFT:
448 assert(ins->bitwise.rshift);
449 return bi_pack_fma_arshift(clause, ins, regs);
450 default:
451 unreachable("Invalid bitwise op");
452 }
453 }
454
/* Packs the FMA slot of a bundle, dispatching on the instruction class and
 * the destination/source types to the generated per-opcode packers. An
 * empty slot packs as NOP. */
static unsigned
bi_pack_fma(bi_clause *clause, bi_bundle bundle, bi_registers *regs)
{
        if (!bundle.fma)
                return pan_pack_fma_nop_i32(clause, NULL, regs);

        /* Destination type flags (bool32 packs like u32) */
        bool f16 = bundle.fma->dest_type == nir_type_float16;
        bool f32 = bundle.fma->dest_type == nir_type_float32;
        bool u32 = bundle.fma->dest_type == nir_type_uint32 ||
                bundle.fma->dest_type == nir_type_bool32;
        bool u16 = bundle.fma->dest_type == nir_type_uint16;
        bool s32 = bundle.fma->dest_type == nir_type_int32;
        bool s16 = bundle.fma->dest_type == nir_type_int16;

        /* Source 0 type flags, used for conversions and comparisons */
        bool src0_f16 = bundle.fma->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.fma->src_types[0] == nir_type_float32;
        bool src0_u16 = bundle.fma->src_types[0] == nir_type_uint16;
        bool src0_s16 = bundle.fma->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.fma->src_types[0] == nir_type_int8;
        bool src0_u8 = bundle.fma->src_types[0] == nir_type_uint8;

        /* EQ/NE do not care about signedness, enabling the generic i32/i16
         * csel variants */
        enum bi_cond cond = bundle.fma->cond;
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.fma->type) {
        case BI_ADD:
                if (bundle.fma->dest_type == nir_type_float32)
                        return pan_pack_fma_fadd_f32(clause, bundle.fma, regs);
                else if (bundle.fma->dest_type == nir_type_float16)
                        return pan_pack_fma_fadd_v2f16(clause, bundle.fma, regs);

                unreachable("TODO");
        case BI_CMP:
                /* Only float compares exist on FMA */
                assert (src0_f16 || src0_f32);

                if (src0_f32)
                        return pan_pack_fma_fcmp_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fcmp_v2f16(clause, bundle.fma, regs);
        case BI_BITWISE:
                return bi_pack_fma_bitwise(clause, bundle.fma, regs);
        case BI_CONVERT:
                /* Each legal source type pairs with exactly one dest type */
                if (src0_s8) {
                        assert(s32);
                        return pan_pack_fma_s8_to_s32(clause, bundle.fma, regs);
                } else if (src0_u8) {
                        assert(u32);
                        return pan_pack_fma_u8_to_u32(clause, bundle.fma, regs);
                } else if (src0_s16) {
                        assert(s32);
                        return pan_pack_fma_s16_to_s32(clause, bundle.fma, regs);
                } else if (src0_u16) {
                        assert(u32);
                        return pan_pack_fma_u16_to_u32(clause, bundle.fma, regs);
                } else if (src0_f16) {
                        assert(f32);
                        return pan_pack_fma_f16_to_f32(clause, bundle.fma, regs);
                } else if (src0_f32) {
                        assert(f16);
                        return pan_pack_fma_v2f32_to_v2f16(clause, bundle.fma, regs);
                }

                unreachable("Invalid FMA convert");
        case BI_CSEL:
                if (f32)
                        return pan_pack_fma_csel_f32(clause, bundle.fma, regs);
                else if (f16)
                        return pan_pack_fma_csel_v2f16(clause, bundle.fma, regs);
                else if ((u32 || s32) && typeless_cond)
                        return pan_pack_fma_csel_i32(clause, bundle.fma, regs);
                else if ((u16 || s16) && typeless_cond)
                        return pan_pack_fma_csel_v2i16(clause, bundle.fma, regs);
                else if (u32)
                        return pan_pack_fma_csel_u32(clause, bundle.fma, regs);
                else if (u16)
                        return pan_pack_fma_csel_v2u16(clause, bundle.fma, regs);
                else if (s32)
                        return pan_pack_fma_csel_s32(clause, bundle.fma, regs);
                else if (s16)
                        return pan_pack_fma_csel_v2s16(clause, bundle.fma, regs);
                else
                        unreachable("Invalid csel type");
        case BI_FMA:
                if (bundle.fma->dest_type == nir_type_float32) {
                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_f32(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_f32(clause, bundle.fma, regs);
                } else {
                        assert(bundle.fma->dest_type == nir_type_float16);

                        if (bundle.fma->op.mscale)
                                return pan_pack_fma_fma_rscale_v2f16(clause, bundle.fma, regs);
                        else
                                return pan_pack_fma_fma_v2f16(clause, bundle.fma, regs);
                }
        case BI_FREXP:
                assert(src0_f32 || src0_f16);

                if (src0_f32)
                        return pan_pack_fma_frexpe_f32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_frexpe_v2f16(clause, bundle.fma, regs);
        case BI_IMATH:
                /* XXX: Only 32-bit, with carries/borrows forced */
                assert(s32 || u32);

                if (bundle.fma->op.imath == BI_IMATH_ADD)
                        return pan_pack_fma_iaddc_i32(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_isubb_i32(clause, bundle.fma, regs);
        case BI_MOV:
                return pan_pack_fma_mov_i32(clause, bundle.fma, regs);
        case BI_SELECT:
                /* SELECT packs as a vector-construct (MKVEC) */
                if (nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 16) {
                        return pan_pack_fma_mkvec_v2i16(clause, bundle.fma, regs);
                } else {
                        assert(nir_alu_type_get_type_size(bundle.fma->src_types[0]) == 8);
                        return pan_pack_fma_mkvec_v4i8(clause, bundle.fma, regs);
                }
        case BI_ROUND:
                assert(f16 || f32);

                if (f16)
                        return pan_pack_fma_fround_v2f16(clause, bundle.fma, regs);
                else
                        return pan_pack_fma_fround_f32(clause, bundle.fma, regs);
        case BI_REDUCE_FMA:
                assert(src0_f32 && f32);
                return pan_pack_fma_fadd_lscale_f32(clause, bundle.fma, regs);
        case BI_IMUL:
                return pan_pack_fma_imul_i32(clause, bundle.fma, regs);
        case BI_SPECIAL_FMA:
                return bi_pack_fma_special(clause, bundle.fma, regs);
        default:
                unreachable("Cannot encode class as FMA");
        }
}
593
/* Packs a conditional branch on the ADD slot. Only the compare-with-zero
 * EQ form is supported here; the branch offset comes either from the
 * clause's embedded constant (branch_target set) or from source 2. */
static unsigned
bi_pack_add_branch_cond(bi_instruction *ins, bi_registers *regs)
{
        assert(ins->cond == BI_COND_EQ);
        assert(ins->src[1] == BIR_INDEX_ZERO);

        unsigned zero_ctrl = 0;
        unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);

        if (size == 16) {
                /* See BR_SIZE_ZERO swizzle disassembly */
                zero_ctrl = ins->swizzle[0][0] ? 1 : 2;
        } else {
                assert(size == 32);
        }

        /* EQ swap to NE */
        bool slot_swapped = false;

        struct bifrost_branch pack = {
                .src0 = bi_get_src(ins, regs, 0),
                .src1 = (zero_ctrl << 1) | !slot_swapped,
                .cond = BR_COND_EQ,
                .size = BR_SIZE_ZERO,
                .op = BIFROST_ADD_OP_BRANCH
        };

        if (ins->branch_target) {
                /* We assigned the constant slot to fetch the branch offset so
                 * we can just passthrough here. We put in the HI slot to match
                 * the blob since that's where the magic flags end up
                 */
                assert(!ins->src[2]);
                pack.src2 = BIFROST_SRC_FAU_HI;
        } else {
                pack.src2 = bi_get_src(ins, regs, 2);
        }

        RETURN_PACKED(pack);
}
634
/* Packs an unconditional branch on the ADD slot. The offset comes either
 * from the clause's embedded constant (branch_target set) or from source 2. */
static unsigned
bi_pack_add_branch_uncond(bi_instruction *ins, bi_registers *regs)
{
        struct bifrost_branch pack = {
                /* It's unclear what these bits actually mean */
                .src0 = BIFROST_SRC_FAU_LO,
                .src1 = BIFROST_SRC_PASS_FMA,

                /* All ones in fact */
                .cond = (BR_ALWAYS & 0x7),
                .size = (BR_ALWAYS >> 3),
                .op = BIFROST_ADD_OP_BRANCH
        };

        if (ins->branch_target) {
                /* Offset is passed as a PC-relative offset through an
                 * embedded constant.
                 */
                assert(!ins->src[2]);
                pack.src2 = BIFROST_SRC_FAU_HI;
        } else {
                pack.src2 = bi_get_src(ins, regs, 2);
        }

        RETURN_PACKED(pack);
}
661
662 static unsigned
bi_pack_add_branch(bi_instruction * ins,bi_registers * regs)663 bi_pack_add_branch(bi_instruction *ins, bi_registers *regs)
664 {
665 if (ins->cond == BI_COND_ALWAYS)
666 return bi_pack_add_branch_uncond(ins, regs);
667 else
668 return bi_pack_add_branch_cond(ins, regs);
669 }
670
671 static unsigned
bi_pack_add_special(bi_clause * clause,bi_instruction * ins,bi_registers * regs)672 bi_pack_add_special(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
673 {
674 bool f16 = ins->dest_type == nir_type_float16;
675
676 switch (ins->op.special) {
677 case BI_SPECIAL_FRCP:
678 return f16 ? pan_pack_add_frcp_f16(clause, ins, regs) :
679 pan_pack_add_frcp_f32(clause, ins, regs);
680 case BI_SPECIAL_FRSQ:
681 return f16 ? pan_pack_add_frsq_f16(clause, ins, regs) :
682 pan_pack_add_frsq_f32(clause, ins, regs);
683 case BI_SPECIAL_EXP2_LOW:
684 assert(!f16);
685 return pan_pack_add_fexp_f32(clause, ins, regs);
686 case BI_SPECIAL_IABS:
687 assert(ins->src_types[0] == nir_type_int32);
688 return pan_pack_add_iabs_s32(clause, ins, regs);
689 case BI_SPECIAL_CUBEFACE2:
690 return pan_pack_add_cubeface2(clause, ins, regs);
691 case BI_SPECIAL_CUBE_SSEL:
692 return pan_pack_add_cube_ssel(clause, ins, regs);
693 case BI_SPECIAL_CUBE_TSEL:
694 return pan_pack_add_cube_tsel(clause, ins, regs);
695 default:
696 unreachable("Unknown special op");
697 }
698 }
699
/* Packs the ADD slot of a bundle, dispatching on the instruction class and
 * the destination/source types to the generated per-opcode packers. An
 * empty slot packs as NOP. NOTE(review): `stage` is currently unused in
 * this body. */
static unsigned
bi_pack_add(bi_clause *clause, bi_bundle bundle, bi_registers *regs, gl_shader_stage stage)
{
        if (!bundle.add)
                return pan_pack_add_nop_i32(clause, NULL, regs);

        /* Destination type flags (bool32 packs like u32) */
        bool f16 = bundle.add->dest_type == nir_type_float16;
        bool f32 = bundle.add->dest_type == nir_type_float32;
        bool u32 = bundle.add->dest_type == nir_type_uint32 ||
                bundle.add->dest_type == nir_type_bool32;
        bool u16 = bundle.add->dest_type == nir_type_uint16;
        bool s32 = bundle.add->dest_type == nir_type_int32;
        bool s16 = bundle.add->dest_type == nir_type_int16;

        /* Source 0 type flags, used for conversions and comparisons */
        bool src0_f16 = bundle.add->src_types[0] == nir_type_float16;
        bool src0_f32 = bundle.add->src_types[0] == nir_type_float32;
        bool src0_u32 = bundle.add->src_types[0] == nir_type_uint32;
        bool src0_u16 = bundle.add->src_types[0] == nir_type_uint16;
        bool src0_u8 = bundle.add->src_types[0] == nir_type_uint8;
        bool src0_s32 = bundle.add->src_types[0] == nir_type_int32;
        bool src0_s16 = bundle.add->src_types[0] == nir_type_int16;
        bool src0_s8 = bundle.add->src_types[0] == nir_type_int8;

        unsigned sz = nir_alu_type_get_type_size(bundle.add->dest_type);

        /* EQ/NE do not care about signedness, enabling the generic icmp
         * variants */
        enum bi_cond cond = bundle.add->cond;
        bool typeless_cond = (cond == BI_COND_EQ) || (cond == BI_COND_NE);

        switch (bundle.add->type) {
        case BI_ADD:
                if (bundle.add->dest_type == nir_type_float32)
                        return pan_pack_add_fadd_f32(clause, bundle.add, regs);
                else if (bundle.add->dest_type == nir_type_float16)
                        return pan_pack_add_fadd_v2f16(clause, bundle.add, regs);

                unreachable("TODO");
        case BI_ATEST:
                return pan_pack_add_atest(clause, bundle.add, regs);
        case BI_BRANCH:
                return bi_pack_add_branch(bundle.add, regs);
        case BI_CMP:
                if (src0_f32)
                        return pan_pack_add_fcmp_f32(clause, bundle.add, regs);
                else if (src0_f16)
                        return pan_pack_add_fcmp_v2f16(clause, bundle.add, regs);
                else if ((src0_u32 || src0_s32) && typeless_cond)
                        return pan_pack_add_icmp_i32(clause, bundle.add, regs);
                else if ((src0_u16 || src0_s16) && typeless_cond)
                        return pan_pack_add_icmp_v2i16(clause, bundle.add, regs);
                else if ((src0_u8 || src0_s8) && typeless_cond)
                        return pan_pack_add_icmp_v4i8(clause, bundle.add, regs);
                else if (src0_u32)
                        return pan_pack_add_icmp_u32(clause, bundle.add, regs);
                else if (src0_u16)
                        return pan_pack_add_icmp_v2u16(clause, bundle.add, regs);
                else if (src0_u8)
                        return pan_pack_add_icmp_v4u8(clause, bundle.add, regs);
                else if (src0_s32)
                        return pan_pack_add_icmp_s32(clause, bundle.add, regs);
                else if (src0_s16)
                        return pan_pack_add_icmp_v2s16(clause, bundle.add, regs);
                else if (src0_s8)
                        return pan_pack_add_icmp_v4s8(clause, bundle.add, regs);
                else
                        unreachable("Invalid cmp type");
        case BI_BLEND:
                return pan_pack_add_blend(clause, bundle.add, regs);
        case BI_BITWISE:
                unreachable("Packing todo");
        case BI_CONVERT:
                /* Each legal (source, dest) type pair has its own packer */
                if (src0_f16 && s16)
                        return pan_pack_add_v2f16_to_v2s16(clause, bundle.add, regs);
                else if (src0_f16 && u16)
                        return pan_pack_add_v2f16_to_v2u16(clause, bundle.add, regs);
                else if (src0_f16 && s32)
                        return pan_pack_add_f16_to_s32(clause, bundle.add, regs);
                else if (src0_f16 && u32)
                        return pan_pack_add_f16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f16)
                        return pan_pack_add_v2s16_to_v2f16(clause, bundle.add, regs);
                else if (src0_u16 && f16)
                        return pan_pack_add_v2u16_to_v2f16(clause, bundle.add, regs);
                else if (src0_s8 && s16)
                        return pan_pack_add_v2s8_to_v2s16(clause, bundle.add, regs);
                else if (src0_u8 && u16)
                        return pan_pack_add_v2u8_to_v2u16(clause, bundle.add, regs);
                else if (src0_s8 && f16)
                        return pan_pack_add_v2s8_to_v2f16(clause, bundle.add, regs);
                else if (src0_u8 && f16)
                        return pan_pack_add_v2u8_to_v2f16(clause, bundle.add, regs);
                else if (src0_f32 && s32)
                        return pan_pack_add_f32_to_s32(clause, bundle.add, regs);
                else if (src0_f32 && u32)
                        return pan_pack_add_f32_to_u32(clause, bundle.add, regs);
                else if (src0_s8 && s32)
                        return pan_pack_add_s8_to_s32(clause, bundle.add, regs);
                else if (src0_u8 && u32)
                        return pan_pack_add_u8_to_u32(clause, bundle.add, regs);
                else if (src0_s8 && f32)
                        return pan_pack_add_s8_to_f32(clause, bundle.add, regs);
                else if (src0_u8 && f32)
                        return pan_pack_add_u8_to_f32(clause, bundle.add, regs);
                else if (src0_s32 && f32)
                        return pan_pack_add_s32_to_f32(clause, bundle.add, regs);
                else if (src0_u32 && f32)
                        return pan_pack_add_u32_to_f32(clause, bundle.add, regs);
                else if (src0_s16 && s32)
                        return pan_pack_add_s16_to_s32(clause, bundle.add, regs);
                else if (src0_u16 && u32)
                        return pan_pack_add_u16_to_u32(clause, bundle.add, regs);
                else if (src0_s16 && f32)
                        return pan_pack_add_s16_to_f32(clause, bundle.add, regs);
                else if (src0_u16 && f32)
                        return pan_pack_add_u16_to_f32(clause, bundle.add, regs);
                else if (src0_f16 && f32)
                        return pan_pack_add_f16_to_f32(clause, bundle.add, regs);
                else if (src0_f32 && f16)
                        return pan_pack_add_v2f32_to_v2f16(clause, bundle.add, regs);
                else
                        unreachable("Invalid ADD convert");
        case BI_DISCARD:
                return pan_pack_add_discard_f32(clause, bundle.add, regs);
        case BI_FREXP:
                unreachable("Packing todo");
        case BI_IMATH:
                assert(sz == 8 || sz == 16 || sz == 32);

                if (bundle.add->op.imath == BI_IMATH_ADD) {
                        return (sz == 8) ? pan_pack_add_iadd_v4s8(clause, bundle.add, regs) :
                                (sz == 16) ? pan_pack_add_iadd_v2s16(clause, bundle.add, regs) :
                                pan_pack_add_iadd_s32(clause, bundle.add, regs);
                } else {
                        return (sz == 8) ? pan_pack_add_isub_v4s8(clause, bundle.add, regs) :
                                (sz == 16) ? pan_pack_add_isub_v2s16(clause, bundle.add, regs) :
                                pan_pack_add_isub_s32(clause, bundle.add, regs);
                }
        case BI_LOAD_ATTR:
                return pan_pack_add_ld_attr_imm(clause, bundle.add, regs);
        case BI_LOAD:
        case BI_LOAD_UNIFORM:
                assert(u32 || s32 || f32);
                /* Variant selected by number of 32-bit channels loaded */
                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_load_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_load_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_load_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_load_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_LOAD_VAR:
                if (bundle.add->src[0] & BIR_INDEX_CONSTANT) {
                        /* Immediate indices >= 20 address special varyings */
                        if (bi_get_immediate(bundle.add, 0) >= 20)
                                return pan_pack_add_ld_var_special(clause, bundle.add, regs);
                        else if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat_imm(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var_imm(clause, bundle.add, regs);
                } else {
                        if (bundle.add->load_vary.flat)
                                return pan_pack_add_ld_var_flat(clause, bundle.add, regs);
                        else
                                return pan_pack_add_ld_var(clause, bundle.add, regs);
                }
        case BI_LOAD_VAR_ADDRESS:
                return pan_pack_add_lea_attr_imm(clause, bundle.add, regs);
        case BI_LOAD_TILE:
                return pan_pack_add_ld_tile(clause, bundle.add, regs);
        case BI_MINMAX:
                if (bundle.add->op.minmax == BI_MINMAX_MIN) {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmin_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmin_v2f16(clause, bundle.add, regs);
                        unreachable("TODO");
                } else {
                        if (bundle.add->dest_type == nir_type_float32)
                                return pan_pack_add_fmax_f32(clause, bundle.add, regs);
                        else if (bundle.add->dest_type == nir_type_float16)
                                return pan_pack_add_fmax_v2f16(clause, bundle.add, regs);
                        unreachable("TODO");
                }
        case BI_MOV:
                unreachable("Packing todo");
        case BI_STORE:
                assert(src0_u32 || src0_s32 || src0_f32);
                /* Variant selected by number of 32-bit channels stored */
                switch (bundle.add->vector_channels) {
                case 1: return pan_pack_add_store_i32(clause, bundle.add, regs);
                case 2: return pan_pack_add_store_i64(clause, bundle.add, regs);
                case 3: return pan_pack_add_store_i96(clause, bundle.add, regs);
                case 4: return pan_pack_add_store_i128(clause, bundle.add, regs);
                default: unreachable("Invalid channel count");
                }
        case BI_STORE_VAR:
                return pan_pack_add_st_cvt(clause, bundle.add, regs);
        case BI_SPECIAL_ADD:
                return bi_pack_add_special(clause, bundle.add, regs);
        case BI_TABLE:
                assert(bundle.add->dest_type == nir_type_float32);
                return pan_pack_add_flogd_f32(clause, bundle.add, regs);
        case BI_SELECT:
                /* SELECT packs as a vector-construct (MKVEC) */
                assert(nir_alu_type_get_type_size(bundle.add->src_types[0]) == 16);
                return pan_pack_add_mkvec_v2i16(clause, bundle.add, regs);
        case BI_TEXC:
                return pan_pack_add_texc(clause, bundle.add, regs);
        case BI_TEXC_DUAL:
                unreachable("Packing todo");
        case BI_TEXS:
                assert(f16 || f32);

                if (f16)
                        return pan_pack_add_texs_2d_f16(clause, bundle.add, regs);
                else
                        return pan_pack_add_texs_2d_f32(clause, bundle.add, regs);
        case BI_ROUND:
                unreachable("Packing todo");
        case BI_ZS_EMIT:
                return pan_pack_add_zs_emit(clause, bundle.add, regs);
        default:
                unreachable("Cannot encode class as ADD");
        }
}
919
/* A bundle packed to wire format. The instruction word is wider than 64
 * bits, so it is split across two qwords: 'lo' holds the register block,
 * the FMA word, and the low 6 bits of ADD; 'hi' holds the remaining ADD
 * bits (see bi_pack_bundle for the exact split, and bi_pack_clause for
 * how the halves are scattered into struct bifrost_fmt1). */
struct bi_packed_bundle {
        uint64_t lo;
        uint64_t hi;
};
924
925 /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
926 * this up at pack time. (Scheduling doesn't care.) */
927
928 static void
bi_flip_slots(bi_registers * regs)929 bi_flip_slots(bi_registers *regs)
930 {
931 if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
932 unsigned temp = regs->slot[0];
933 regs->slot[0] = regs->slot[1];
934 regs->slot[1] = temp;
935 }
936
937 }
938
939 /* Lower CUBEFACE2 to a CUBEFACE1/CUBEFACE2. This is a hack so the scheduler
940 * doesn't have to worry about this while we're just packing singletons */
941
static void
bi_lower_cubeface2(bi_context *ctx, bi_bundle *bundle)
{
        /* Filter for +CUBEFACE2: everything else passes through unmodified */
        if (!bundle->add || bundle->add->type != BI_SPECIAL_ADD
                        || bundle->add->op.special != BI_SPECIAL_CUBEFACE2) {
                return;
        }

        /* This won't be used once we emit non-singletons, for now this is just
         * a fact of our scheduler and allows us to clobber FMA */
        assert(!bundle->fma);

        /* Construct an FMA op: *CUBEFACE1, which feeds +CUBEFACE2 over the
         * staging passthrough in the same bundle */
        bi_instruction cubeface1 = {
                .type = BI_SPECIAL_FMA,
                .op.special = BI_SPECIAL_CUBEFACE1,
                /* no dest, just to a temporary */
                .dest_type = nir_type_float32,
                .src_types = { nir_type_float32, nir_type_float32, nir_type_float32 },
        };

        /* Copy over the register allocated sources (coordinates). */
        memcpy(&cubeface1.src, bundle->add->src, sizeof(cubeface1.src));

        /* Zeroed by RA since this is all 32-bit */
        for (unsigned i = 0; i < 3; ++i)
                assert(bundle->add->swizzle[i][0] == 0);

        /* Emit the instruction into the IR stream so it gets packed */
        bundle->fma = bi_emit_before(ctx, bundle->add, cubeface1);

        /* Now replace the sources of the CUBEFACE2 with a single passthrough
         * from the CUBEFACE1 (and a side-channel) */
        bundle->add->src[0] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
        bundle->add->src[1] = bundle->add->src[2] = 0;
}
979
980 static struct bi_packed_bundle
bi_pack_bundle(bi_clause * clause,bi_bundle bundle,bi_bundle prev,bool first_bundle,gl_shader_stage stage)981 bi_pack_bundle(bi_clause *clause, bi_bundle bundle, bi_bundle prev, bool first_bundle, gl_shader_stage stage)
982 {
983 bi_assign_slots(&bundle, &prev);
984 bi_assign_fau_idx(clause, &bundle);
985 bundle.regs.first_instruction = first_bundle;
986
987 bi_flip_slots(&bundle.regs);
988
989 uint64_t reg = bi_pack_registers(bundle.regs);
990 uint64_t fma = bi_pack_fma(clause, bundle, &bundle.regs);
991 uint64_t add = bi_pack_add(clause, bundle, &bundle.regs, stage);
992
993 struct bi_packed_bundle packed = {
994 .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
995 .hi = add >> 6
996 };
997
998 return packed;
999 }
1000
1001 /* Packs the next two constants as a dedicated constant quadword at the end of
1002 * the clause, returning the number packed. There are two cases to consider:
1003 *
1004 * Case #1: Branching is not used. For a single constant copy the upper nibble
1005 * over, easy.
1006 *
1007 * Case #2: Branching is used. For a single constant, it suffices to set the
1008 * upper nibble to 4 and leave the latter constant 0, which matches what the
1009 * blob does.
1010 *
1011 * Extending to multiple constants is considerably more tricky and left for
1012 * future work.
1013 */
1014
static unsigned
bi_pack_constants(bi_context *ctx, bi_clause *clause,
                unsigned index,
                struct util_dynarray *emission)
{
        /* After these two, are we done? Determines tag */
        bool done = clause->constant_count <= (index + 2);
        ASSERTED bool only = clause->constant_count <= (index + 1);

        /* Is the constant we're packing for a branch? */
        bool branches = clause->branch_constant && done;

        /* TODO: Pos -- currently only a single constant quad at position 0
         * with a single-bundle clause is supported (asserted below) */
        assert(index == 0 && clause->bundle_count == 1);
        assert(only);

        /* Compute branch offset instead of a dummy 0 */
        if (branches) {
                /* The branch is the last (here: only) bundle's ADD op */
                bi_instruction *br = clause->bundles[clause->bundle_count - 1].add;
                assert(br && br->type == BI_BRANCH && br->branch_target);

                /* Put it in the high place: offset is in clause quadwords,
                 * converted to bytes (16 bytes per quadword) */
                int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
                int32_t bytes = qwords * 16;

                /* Copy so we get proper sign behaviour (avoids
                 * implementation-defined signed-shift issues) */
                uint32_t raw = 0;
                memcpy(&raw, &bytes, sizeof(raw));

                /* Clear off top bits for the magic bits */
                raw &= ~0xF0000000;

                /* Put in top 32-bits */
                clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
        }

        /* Upper nibble of the first constant, needed for the encoding quirk
         * handled below */
        uint64_t hi = clause->constants[index + 0] >> 60ull;

        struct bifrost_fmt_constant quad = {
                .pos = 0, /* TODO */
                .tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
                .imm_1 = clause->constants[index + 0] >> 4,
                /* Case #1 (no branch): replicate the upper nibble into the
                 * second (unused) constant slot when hi < 8 */
                .imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
        };

        if (branches) {
                /* Case #2 (branch): upper nibble forced to 4, second
                 * constant left 0, matching the blob. Branch offsets are
                 * less than 60-bits so this should work at least for now */
                quad.imm_1 |= (4ull << 60ull) >> 4;
                assert (hi == 0);
        }

        /* XXX: On G71, Connor observed that the difference of the top 4 bits
         * of the second constant with the first must be less than 8, otherwise
         * we have to swap them. On G52, I'm able to reproduce a similar issue
         * but with a different workaround (modeled above with a single
         * constant, unclear how to workaround for multiple constants.) Further
         * investigation needed. Possibly an errata. XXX */

        util_dynarray_append(emission, struct bifrost_fmt_constant, quad);

        /* Two constant slots are consumed per emitted quadword */
        return 2;
}
1078
/* Packs one clause to the emission stream: a single format-1 quadword for
 * the (currently single) bundle plus header, followed by any constant
 * quadwords. next_1/next_2 are the possible successor clauses used to
 * compute the header's dependency-wait mask. */
static void
bi_pack_clause(bi_context *ctx, bi_clause *clause,
                bi_clause *next_1, bi_clause *next_2,
                struct util_dynarray *emission, gl_shader_stage stage,
                bool tdd)
{
        /* After the deadline lowering */
        bi_lower_cubeface2(ctx, &clause->bundles[0]);

        /* prev == current is fine for the first bundle of a clause */
        struct bi_packed_bundle ins_1 = bi_pack_bundle(clause, clause->bundles[0], clause->bundles[0], true, stage);
        assert(clause->bundle_count == 1);

        /* State for packing constants throughout */
        unsigned constant_index = 0;

        struct bifrost_fmt1 quad_1 = {
                .tag = clause->constant_count ? BIFROST_FMT1_CONSTANTS : BIFROST_FMT1_FINAL,
                .header = bi_pack_header(clause, next_1, next_2, tdd),
                .ins_1 = ins_1.lo,
                /* The bundle's high word is split: low 11 bits to ins_2,
                 * next 3 bits to ins_0 */
                .ins_2 = ins_1.hi & ((1 << 11) - 1),
                .ins_0 = (ins_1.hi >> 11) & 0b111,
        };

        util_dynarray_append(emission, struct bifrost_fmt1, quad_1);

        /* Pack the remaining constants, two per emitted quadword */

        while (constant_index < clause->constant_count) {
                constant_index += bi_pack_constants(ctx, clause,
                                constant_index, emission);
        }
}
1111
/* Finds the clause that follows 'clause' in program order, crossing block
 * boundaries and skipping empty blocks. Pass clause = NULL to get the first
 * clause at or after 'block'. Returns NULL at the end of the program. */
static bi_clause *
bi_next_clause(bi_context *ctx, pan_block *block, bi_clause *clause)
{
        /* Try the first clause in this block if we're starting from scratch */
        if (!clause && !list_is_empty(&((bi_block *) block)->clauses))
                return list_first_entry(&((bi_block *) block)->clauses, bi_clause, link);

        /* Try the next clause in this block: list_first_entry on the current
         * link yields the entry after it */
        if (clause && clause->link.next != &((bi_block *) block)->clauses)
                return list_first_entry(&(clause->link), bi_clause, link);

        /* Try the next block, or the one after that if it's empty, etc .*/
        pan_block *next_block = pan_next_block(block);

        bi_foreach_block_from(ctx, next_block, block) {
                bi_block *blk = (bi_block *) block;

                if (!list_is_empty(&blk->clauses))
                        return list_first_entry(&(blk->clauses), bi_clause, link);
        }

        /* Ran off the end of the program */
        return NULL;
}
1135
1136 /* We should terminate discarded threads if there may be discarded threads (a
1137 * fragment shader) and helper invocations are not used. Further logic may be
1138 * required for future discard/demote differentiation
1139 */
1140
1141 static bool
bi_terminate_discarded_threads(bi_context * ctx)1142 bi_terminate_discarded_threads(bi_context *ctx)
1143 {
1144 if (ctx->stage == MESA_SHADER_FRAGMENT)
1145 return !ctx->nir->info.fs.needs_helper_invocations;
1146 else
1147 return false;
1148 }
1149
/* Records the emission offset following a BI_BLEND clause so the driver can
 * patch blend-shader return addresses. Only relevant for fragment shaders
 * that call into blend shaders, not for blend shaders themselves. */
static void
bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
                const bi_clause *clause)
{
        /* No need to collect return addresses when we're in a blend shader. */
        if (ctx->is_blend)
                return;

        const bi_bundle *bundle = &clause->bundles[clause->bundle_count - 1];
        const bi_instruction *ins = bundle->add;

        if (!ins || ins->type != BI_BLEND)
                return;

        /* We don't support non-terminal blend instructions yet.
         * That would requires fixing blend shaders to restore the registers
         * they use before jumping back to the fragment shader, which is
         * currently not supported.
         */
        /* NOTE(review): this assert(0) fires on any non-terminal BI_BLEND
         * in debug builds, so the bookkeeping below only executes in
         * release (NDEBUG) builds -- presumably intentional as a loud
         * "unimplemented" marker; confirm before relying on the offsets. */
        assert(0);

        assert(ins->blend_location < ARRAY_SIZE(ctx->blend_ret_offsets));
        assert(!ctx->blend_ret_offsets[ins->blend_location]);
        /* Offset in bytes of the code following this clause; hardware
         * requires 8-byte alignment (asserted below) */
        ctx->blend_ret_offsets[ins->blend_location] =
                util_dynarray_num_elements(emission, uint8_t);
        assert(!(ctx->blend_ret_offsets[ins->blend_location] & 0x7));
}
1177
/* Top-level entry point for packing: walks every clause of every block in
 * order, emitting packed machine code into 'emission'. */
void
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
        bool tdd = bi_terminate_discarded_threads(ctx);

        bi_foreach_block(ctx, _block) {
                bi_block *block = (bi_block *) _block;

                /* Passthrough the first clause of where we're branching to for
                 * the last clause of the block (the clause with the branch) */

                /* NOTE(review): the condition tests successors[1] but fetches
                 * from successors[0] -- presumably "two successors implies a
                 * conditional branch whose fallthrough clause we need";
                 * confirm against the header dependency encoding. */
                bi_clause *succ_clause = block->base.successors[1] ?
                        bi_next_clause(ctx, block->base.successors[0], NULL) : NULL;

                bi_foreach_clause_in_block(block, clause) {
                        bool is_last = clause->link.next == &block->clauses;

                        /* next = clause following in program order;
                         * next_2 = branch-target clause, only for the last
                         * clause of the block */
                        bi_clause *next = bi_next_clause(ctx, _block, clause);
                        bi_clause *next_2 = is_last ? succ_clause : NULL;

                        bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage, tdd);

                        /* Blend return address is the offset after this
                         * clause, so only meaningful when code follows */
                        if (!is_last)
                                bi_collect_blend_ret_addr(ctx, emission, clause);
                }
        }
}
1205