1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 
/* Finds the clause (message) type an instruction requires, or returns none */
30 
31 static bool
bi_is_fragz(bi_instruction * ins)32 bi_is_fragz(bi_instruction *ins)
33 {
34         if (!(ins->src[0] & BIR_INDEX_CONSTANT))
35                 return false;
36 
37         return (ins->constant.u32 == BIFROST_FRAGZ);
38 }
39 
40 static enum bifrost_message_type
bi_message_type_for_ins(bi_instruction * ins)41 bi_message_type_for_ins(bi_instruction *ins)
42 {
43         unsigned T = ins->type;
44 
45         /* Only high latency ops impose clause types */
46         if (!(bi_class_props[T] & BI_SCHED_HI_LATENCY))
47                 return BIFROST_MESSAGE_NONE;
48 
49         switch (T) {
50         case BI_BRANCH:
51         case BI_DISCARD:
52                 return BIFROST_MESSAGE_NONE;
53 
54         case BI_LOAD_VAR:
55                 if (bi_is_fragz(ins))
56                         return BIFROST_MESSAGE_Z_STENCIL;
57 
58                 return BIFROST_MESSAGE_VARYING;
59 
60         case BI_LOAD_UNIFORM:
61         case BI_LOAD_ATTR:
62         case BI_LOAD_VAR_ADDRESS:
63                 return BIFROST_MESSAGE_ATTRIBUTE;
64 
65         case BI_TEXS:
66         case BI_TEXC:
67         case BI_TEXC_DUAL:
68                 return BIFROST_MESSAGE_TEX;
69 
70         case BI_LOAD:
71                 return BIFROST_MESSAGE_LOAD;
72 
73         case BI_STORE:
74         case BI_STORE_VAR:
75                 return BIFROST_MESSAGE_STORE;
76 
77         case BI_BLEND:
78                 return BIFROST_MESSAGE_BLEND;
79 
80         case BI_LOAD_TILE:
81                 return BIFROST_MESSAGE_TILE;
82 
83         case BI_ATEST:
84                 return BIFROST_MESSAGE_ATEST;
85 
86         case BI_ZS_EMIT:
87                 return BIFROST_MESSAGE_Z_STENCIL;
88 
89         default:
90                 unreachable("Invalid high-latency class");
91         }
92 }
93 
94 /* There is an encoding restriction against FMA fp16 add/min/max
95  * having both sources with abs(..) with a duplicated source. This is
96  * due to the packing being order-sensitive, so the slots must end up distinct
97  * to handle both having abs(..). The swizzle doesn't matter here. Note
98  * BIR_INDEX_REGISTER generally should not be used pre-schedule (TODO: enforce
99  * this).
100  */
101 
102 static bool
bi_ambiguous_abs(bi_instruction * ins)103 bi_ambiguous_abs(bi_instruction *ins)
104 {
105         bool classy = bi_class_props[ins->type] & BI_NO_ABS_ABS_FP16_FMA;
106         bool typey = ins->dest_type == nir_type_float16;
107         bool absy = ins->src_abs[0] && ins->src_abs[1];
108 
109         return classy && typey && absy;
110 }
111 
112 /* New Bifrost (which?) don't seem to have ICMP on FMA */
113 static bool
bi_icmp(bi_instruction * ins)114 bi_icmp(bi_instruction *ins)
115 {
116         bool ic = nir_alu_type_get_base_type(ins->src_types[0]) != nir_type_float;
117         return ic && (ins->type == BI_CMP);
118 }
119 
120 /* No 8/16-bit IADD/ISUB on FMA */
121 static bool
bi_imath_small(bi_instruction * ins)122 bi_imath_small(bi_instruction *ins)
123 {
124         bool sz = nir_alu_type_get_type_size(ins->src_types[0]) < 32;
125         return sz && (ins->type == BI_IMATH);
126 }
127 
128 /* Lowers FMOV to ADD #0, since FMOV doesn't exist on the h/w and this is the
129  * latest time it's sane to lower (it's useful to distinguish before, but we'll
130  * need this handle during scheduling to ensure the slots get modeled
131  * correctly with respect to the new zero source) */
132 
133 static void
bi_lower_fmov(bi_instruction * ins)134 bi_lower_fmov(bi_instruction *ins)
135 {
136         if (ins->type != BI_FMOV)
137                 return;
138 
139         ins->type = BI_ADD;
140         ins->src[1] = BIR_INDEX_ZERO;
141         ins->src_types[1] = ins->src_types[0];
142 }
143 
144 /* To work out the back-to-back flag, we need to detect branches and
145  * "fallthrough" branches, implied in the last clause of a block that falls
146  * through to another block with *multiple predecessors*. */
147 
148 static bool
bi_back_to_back(bi_block * block)149 bi_back_to_back(bi_block *block)
150 {
151         /* Last block of a program */
152         if (!block->base.successors[0]) {
153                 assert(!block->base.successors[1]);
154                 return false;
155         }
156 
157         /* Multiple successors? We're branching */
158         if (block->base.successors[1])
159                 return false;
160 
161         struct pan_block *succ = block->base.successors[0];
162         assert(succ->predecessors);
163         unsigned count = succ->predecessors->entries;
164 
165         /* Back to back only if the successor has only a single predecessor */
166         return (count == 1);
167 }
168 
169 /* Insert a clause wrapping a single instruction */
170 
171 bi_clause *
bi_make_singleton(void * memctx,bi_instruction * ins,bi_block * block,unsigned scoreboard_id,unsigned dependencies,bool osrb)172 bi_make_singleton(void *memctx, bi_instruction *ins,
173                 bi_block *block,
174                 unsigned scoreboard_id,
175                 unsigned dependencies,
176                 bool osrb)
177 {
178         unsigned props = bi_class_props[ins->type];
179 
180         bi_clause *u = rzalloc(memctx, bi_clause);
181         u->bundle_count = 1;
182 
183         /* Check for scheduling restrictions */
184 
185         bool can_fma = props & BI_SCHED_FMA;
186         ASSERTED bool can_add = props & BI_SCHED_ADD;
187 
188         can_fma &= !bi_ambiguous_abs(ins);
189         can_fma &= !bi_icmp(ins);
190         can_fma &= !bi_imath_small(ins);
191 
192         assert(can_fma || can_add);
193 
194         if (can_fma)
195                 u->bundles[0].fma = ins;
196         else
197                 u->bundles[0].add = ins;
198 
199         u->scoreboard_id = scoreboard_id;
200         u->staging_barrier = osrb;
201         u->dependencies = dependencies;
202 
203         if (ins->type == BI_ATEST)
204                 u->dependencies |= (1 << 6);
205 
206         if (ins->type == BI_BLEND)
207                 u->dependencies |= (1 << 6) | (1 << 7);
208 
209         /* Let's be optimistic, we'll fix up later */
210         u->flow_control = BIFROST_FLOW_NBTB;
211 
212         u->constant_count = 1;
213         u->constants[0] = ins->constant.u64;
214 
215         if (ins->type == BI_BRANCH && ins->branch_target)
216                 u->branch_constant = true;
217 
218         /* We always prefetch except unconditional branches */
219         u->next_clause_prefetch = !(
220                         (ins->type == BI_BRANCH) &&
221                         (ins->cond == BI_COND_ALWAYS));
222 
223         u->message_type = bi_message_type_for_ins(ins);
224         u->block = block;
225 
226         return u;
227 }
228 
229 /* Eventually, we'll need a proper scheduling, grouping instructions
230  * into clauses and ordering/assigning grouped instructions to the
231  * appropriate FMA/ADD slots. Right now we do the dumbest possible
232  * thing just to have the scheduler stubbed out so we can focus on
233  * codegen */
234 
235 void
bi_schedule(bi_context * ctx)236 bi_schedule(bi_context *ctx)
237 {
238         bool is_first = true;
239 
240         bi_foreach_block(ctx, block) {
241                 bi_block *bblock = (bi_block *) block;
242 
243                 list_inithead(&bblock->clauses);
244 
245                 bi_foreach_instr_in_block(bblock, ins) {
246                         /* Convenient time to lower */
247                         bi_lower_fmov(ins);
248 
249                         bi_clause *u = bi_make_singleton(ctx, ins,
250                                         bblock, 0, (1 << 0),
251                                         !is_first);
252 
253                         is_first = false;
254                         list_addtail(&u->link, &bblock->clauses);
255                 }
256 
257                 /* Back-to-back bit affects only the last clause of a block,
258                  * the rest are implicitly true */
259 
260                 if (!list_is_empty(&bblock->clauses)) {
261                         bi_clause *last_clause = list_last_entry(&bblock->clauses, bi_clause, link);
262                         if (!bi_back_to_back(bblock))
263                                 last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
264                 }
265 
266                 bblock->scheduled = true;
267         }
268 }
269