/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nir.h"
#include "nir_vla.h"

/* Lowering for amul instructions, for drivers that support imul24.
 * This pass analyzes indirect derefs and converts the corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range:
 *
 * 1) Analyze the UBO and SSBO variables and build a table of which
 *    ones are either too large, or might be too large (unknown size),
 *    for imul24.
 *
 * 2) Loop through all the intrinsics, finding dereferences of large
 *    variables, and recursively replacing all involved amul
 *    instructions with imul.
 *
 * 3) Finally, loop through all instructions again, replacing any
 *    remaining amul with imul24.  Thanks to the 2nd step, at this
 *    point no remaining amul is involved in calculating an offset
 *    into a large variable, so they can all be safely replaced with
 *    imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, a single amul is used to calculate offsets into
 * both a large and a small variable.
 */
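
/* For example (a hypothetical NIR snippet, for illustration only), CSE
 * can leave a single amul feeding offsets into two different buffers:
 *
 *    x = amul(idx, stride)
 *    ... = load_ubo(large_ubo_idx, x)    ; needs a full 32b multiply
 *    ... = load_ubo(small_ubo_idx, x)    ; 24b would have sufficed
 *
 * Step 2 rewrites the shared amul to imul, which is conservative but
 * still correct for the access to the small UBO.
 */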

typedef struct {
   nir_shader *shader;

   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base to whether
    * they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* For cases where we cannot determine the UBO/SSBO index, track
    * whether *any* UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;

   unsigned max_slot;
} lower_state;

/* Lower 'amul's in the offset src of large variables to 'imul': */
static bool
lower_large_src(nir_src *src, void *s)
{
   lower_state *state = s;

   assert(src->is_ssa);

   nir_instr *parent = src->ssa->parent_instr;

   /* No need to visit instructions we've already visited; this also
    * avoids infinite recursion when phis are involved:
    */
   if (parent->pass_flags)
      return false;

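   /* Recurse through all of this instruction's sources first, so that
    * every amul anywhere in the chain feeding the offset calculation
    * gets converted:
    */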
   bool progress = nir_foreach_src(parent, lower_large_src, state);

   if (parent->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(parent);
      if (alu->op == nir_op_amul) {
         alu->op = nir_op_imul;
         progress = true;
      }
   }

   parent->pass_flags = 1;

   return progress;
}

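/* Does this buffer-index src refer to a UBO that is too large for
 * imul24?  If the index is not a compile-time constant, conservatively
 * answer based on whether *any* UBO is too large:
 */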
static bool
large_ubo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ubo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ubos);
   return state->large_ubos[idx];
}

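/* Same check, for SSBOs: */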
static bool
large_ssbo(lower_state *state, nir_src src)
{
   if (!nir_src_is_const(src))
      return state->has_large_ssbo;
   unsigned idx = nir_src_as_uint(src);
   assert(idx < state->shader->info.num_ssbos);
   return state->large_ssbos[idx];
}

static bool
lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      /* src[] = { buffer_index, offset }. */
      if (large_ubo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_load_ssbo:
      /* src[] = { buffer_index, offset }. */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_store_ssbo:
      /* src[] = { value, block_index, offset } */
      if (large_ssbo(state, intr->src[1]))
         return lower_large_src(&intr->src[2], state);
      return false;

   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_fmin:
   case nir_intrinsic_ssbo_atomic_fmax:
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      /* src[0]: SSBO index
       * src[1]: offset
       */
      if (large_ssbo(state, intr->src[0]))
         return lower_large_src(&intr->src[1], state);
      return false;

   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
   case nir_intrinsic_global_atomic_fadd:
   case nir_intrinsic_global_atomic_fmin:
   case nir_intrinsic_global_atomic_fmax:
   case nir_intrinsic_global_atomic_fcomp_swap:
      /* Just assume that 24b is not sufficient for global addressing: */
      return lower_large_src(&intr->src[0], state);

   /* These should all be small enough to unconditionally use imul24.
    * Any intrinsic not handled above is likewise left alone here, so
    * its amuls become imul24 in the final pass:
    */
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
   case nir_intrinsic_store_output:
   default:
      return false;
   }
}

static bool
lower_instr(lower_state *state, nir_instr *instr)
{
   bool progress = false;

   if (instr->type == nir_instr_type_intrinsic) {
      progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
   }

   return progress;
}

static bool
is_large(lower_state *state, nir_variable *var)
{
   const struct glsl_type *type = glsl_without_array(var->type);
   unsigned size = state->type_size(type, false);

   /* If the size is not known (i.e. a VLA) then assume the worst: */
   if (!size)
      return true;

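   /* imul24 sign-extends the low 24 bits of each source, so an offset
    * must fit in 23 bits plus a sign bit; anything at or above 1 << 23
    * is too large:
    */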
   return size >= (1 << 23);
}

bool
nir_lower_amul(nir_shader *shader,
               int (*type_size)(const struct glsl_type *, bool))
{
   assert(shader->options->has_imul24);
   assert(type_size);

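   /* Stack-allocated per-binding tables, zero-filled so that nothing
    * is initially marked as large:
    */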
   NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
   NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);

   lower_state state = {
      .shader = shader,
      .type_size = type_size,
      .large_ubos = large_ubos,
      .large_ssbos = large_ssbos,
   };

   /* Step 1: figure out which UBOs or SSBOs are large enough to be
    * disqualified from imul24:
    */
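   /* Note: an array of UBOs/SSBOs occupies consecutive bindings, so a
    * single large variable may need to mark several table entries:
    */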
   nir_foreach_variable_in_shader (var, shader) {
      if (var->data.mode == nir_var_mem_ubo) {
         if (is_large(&state, var)) {
            state.has_large_ubo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ubos[var->data.binding + i] = true;
         }
      } else if (var->data.mode == nir_var_mem_ssbo) {
         if (is_large(&state, var)) {
            state.has_large_ssbo = true;
            unsigned size = MAX2(1, glsl_array_size(var->type));
            for (unsigned i = 0; i < size; i++)
               state.large_ssbos[var->data.binding + i] = true;
         }
      }
   }

   /* Clear pass_flags, which lower_large_src() uses to mark visited
    * instructions:
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;
      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            instr->pass_flags = 0;
         }
      }
   }

   bool progress = false;
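   /* Step 2: rewrite amuls involved in calculating an offset into a
    * large variable to imul:
    */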
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            progress |= lower_instr(&state, instr);
         }
      }
   }

   /* Step 3: at this point, all amuls used in calculating an offset
    * into a large variable have been replaced with imul, so the
    * remaining amuls can be replaced with imul24:
    */
   nir_foreach_function(function, shader) {
      nir_function_impl *impl = function->impl;

      if (!impl)
         continue;

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            if (alu->op != nir_op_amul)
               continue;

            alu->op = nir_op_imul24;
            progress = true;
         }
      }

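      /* Only instruction opcodes were changed; the CFG is untouched,
       * so block indices and dominance information remain valid:
       */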
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}