1 /*
2  * © Copyright 2018 Alyssa Rosenzweig
3  * Copyright (C) 2019-2020 Collabora, Ltd.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  */
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include "pan_bo.h"
30 #include "pan_context.h"
31 #include "pan_util.h"
32 #include "panfrost-quirks.h"
33 
34 #include "compiler/nir/nir.h"
35 #include "nir/tgsi_to_nir.h"
36 #include "midgard/midgard_compile.h"
37 #include "bifrost/bifrost_compile.h"
38 #include "util/u_dynarray.h"
39 #include "util/u_upload_mgr.h"
40 
41 #include "tgsi/tgsi_dump.h"
42 
43 static void
pan_prepare_midgard_props(struct panfrost_shader_state * state,gl_shader_stage stage)44 pan_prepare_midgard_props(struct panfrost_shader_state *state,
45                           gl_shader_stage stage)
46 {
47         pan_prepare(&state->properties, RENDERER_PROPERTIES);
48         state->properties.uniform_buffer_count = state->ubo_count;
49         state->properties.midgard.uniform_count = state->uniform_count;
50         state->properties.midgard.shader_has_side_effects = state->writes_global;
51 
52         /* TODO: Select the appropriate mode. Suppresing inf/nan works around
53          * some bugs in gles2 apps (eg glmark2's terrain scene) but isn't
54          * conformant on gles3 */
55         state->properties.midgard.fp_mode = MALI_FP_MODE_GL_INF_NAN_SUPPRESSED;
56 
57         /* For fragment shaders, work register count, early-z, reads at draw-time */
58 
59         if (stage != MESA_SHADER_FRAGMENT)
60                 state->properties.midgard.work_register_count = state->work_reg_count;
61 }
62 
63 static void
pan_prepare_bifrost_props(struct panfrost_shader_state * state,gl_shader_stage stage)64 pan_prepare_bifrost_props(struct panfrost_shader_state *state,
65                           gl_shader_stage stage)
66 {
67 
68         switch (stage) {
69         case MESA_SHADER_VERTEX:
70                 pan_prepare(&state->properties, RENDERER_PROPERTIES);
71                 state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
72                 state->properties.uniform_buffer_count = state->ubo_count;
73 
74                 pan_prepare(&state->preload, PRELOAD);
75                 state->preload.uniform_count = state->uniform_count;
76                 state->preload.vertex.vertex_id = true;
77                 state->preload.vertex.instance_id = true;
78                 break;
79         case MESA_SHADER_FRAGMENT:
80                 pan_prepare(&state->properties, RENDERER_PROPERTIES);
81                 /* Early-Z set at draw-time */
82                 if (state->writes_depth || state->writes_stencil) {
83                         state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
84                         state->properties.bifrost.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
85                 } else {
86                         state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
87                         state->properties.bifrost.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
88                 }
89                 state->properties.uniform_buffer_count = state->ubo_count;
90                 state->properties.bifrost.shader_modifies_coverage = state->can_discard;
91 
92                 pan_prepare(&state->preload, PRELOAD);
93                 state->preload.uniform_count = state->uniform_count;
94                 state->preload.fragment.fragment_position = state->reads_frag_coord;
95                 state->preload.fragment.coverage = true;
96                 state->preload.fragment.primitive_flags = state->reads_face;
97                 break;
98         default:
99                 unreachable("TODO");
100         }
101 }
102 
103 static void
pan_upload_shader_descriptor(struct panfrost_context * ctx,struct panfrost_shader_state * state)104 pan_upload_shader_descriptor(struct panfrost_context *ctx,
105                         struct panfrost_shader_state *state)
106 {
107         const struct panfrost_device *dev = pan_device(ctx->base.screen);
108         struct mali_state_packed *out;
109 
110         u_upload_alloc(ctx->state_uploader, 0, MALI_RENDERER_STATE_LENGTH, MALI_RENDERER_STATE_LENGTH,
111                         &state->upload.offset, &state->upload.rsrc, (void **) &out);
112 
113         pan_pack(out, RENDERER_STATE, cfg) {
114                 cfg.shader = state->shader;
115                 cfg.properties = state->properties;
116 
117                 if (dev->quirks & IS_BIFROST)
118                         cfg.preload = state->preload;
119         }
120 
121         u_upload_unmap(ctx->state_uploader);
122 }
123 
124 static unsigned
pan_format_from_nir_base(nir_alu_type base)125 pan_format_from_nir_base(nir_alu_type base)
126 {
127         switch (base) {
128         case nir_type_int:
129                 return MALI_FORMAT_SINT;
130         case nir_type_uint:
131         case nir_type_bool:
132                 return MALI_FORMAT_UINT;
133         case nir_type_float:
134                 return MALI_CHANNEL_FLOAT;
135         default:
136                 unreachable("Invalid base");
137         }
138 }
139 
140 static unsigned
pan_format_from_nir_size(nir_alu_type base,unsigned size)141 pan_format_from_nir_size(nir_alu_type base, unsigned size)
142 {
143         if (base == nir_type_float) {
144                 switch (size) {
145                 case 16: return MALI_FORMAT_SINT;
146                 case 32: return MALI_FORMAT_UNORM;
147                 default:
148                         unreachable("Invalid float size for format");
149                 }
150         } else {
151                 switch (size) {
152                 case 1:
153                 case 8:  return MALI_CHANNEL_8;
154                 case 16: return MALI_CHANNEL_16;
155                 case 32: return MALI_CHANNEL_32;
156                 default:
157                          unreachable("Invalid int size for format");
158                 }
159         }
160 }
161 
162 static enum mali_format
pan_format_from_glsl(const struct glsl_type * type,unsigned precision,unsigned frac)163 pan_format_from_glsl(const struct glsl_type *type, unsigned precision, unsigned frac)
164 {
165         const struct glsl_type *column = glsl_without_array_or_matrix(type);
166         enum glsl_base_type glsl_base = glsl_get_base_type(column);
167         nir_alu_type t = nir_get_nir_type_for_glsl_base_type(glsl_base);
168         unsigned chan = glsl_get_components(column);
169 
170         /* If we have a fractional location added, we need to increase the size
171          * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
172          * We could do better but this is an edge case as it is, normally
173          * packed varyings will be aligned. */
174         chan += frac;
175 
176         assert(chan >= 1 && chan <= 4);
177 
178         unsigned base = nir_alu_type_get_base_type(t);
179         unsigned size = nir_alu_type_get_type_size(t);
180 
181         /* Demote to fp16 where possible. int16 varyings are TODO as the hw
182          * will saturate instead of wrap which is not conformant, so we need to
183          * insert i2i16/u2u16 instructions before the st_vary_32i/32u to get
184          * the intended behaviour */
185 
186         bool is_16 = (precision == GLSL_PRECISION_MEDIUM)
187                 || (precision == GLSL_PRECISION_LOW);
188 
189         if (is_16 && base == nir_type_float)
190                 size = 16;
191         else
192                 size = 32;
193 
194         return pan_format_from_nir_base(base) |
195                 pan_format_from_nir_size(base, size) |
196                 MALI_NR_CHANNELS(chan);
197 }
198 
199 static enum mali_bifrost_register_file_format
bifrost_blend_type_from_nir(nir_alu_type nir_type)200 bifrost_blend_type_from_nir(nir_alu_type nir_type)
201 {
202         switch(nir_type) {
203         case 0: /* Render target not in use */
204                 return 0;
205         case nir_type_float16:
206                 return MALI_BIFROST_REGISTER_FILE_FORMAT_F16;
207         case nir_type_float32:
208                 return MALI_BIFROST_REGISTER_FILE_FORMAT_F32;
209         case nir_type_int32:
210                 return MALI_BIFROST_REGISTER_FILE_FORMAT_I32;
211         case nir_type_uint32:
212                 return MALI_BIFROST_REGISTER_FILE_FORMAT_U32;
213         case nir_type_int16:
214                 return MALI_BIFROST_REGISTER_FILE_FORMAT_I16;
215         case nir_type_uint16:
216                 return MALI_BIFROST_REGISTER_FILE_FORMAT_U16;
217         default:
218                 unreachable("Unsupported blend shader type for NIR alu type");
219                 return 0;
220         }
221 }
222 
223 void
panfrost_shader_compile(struct panfrost_context * ctx,enum pipe_shader_ir ir_type,const void * ir,gl_shader_stage stage,struct panfrost_shader_state * state,uint64_t * outputs_written)224 panfrost_shader_compile(struct panfrost_context *ctx,
225                         enum pipe_shader_ir ir_type,
226                         const void *ir,
227                         gl_shader_stage stage,
228                         struct panfrost_shader_state *state,
229                         uint64_t *outputs_written)
230 {
231         struct panfrost_device *dev = pan_device(ctx->base.screen);
232 
233         nir_shader *s;
234 
235         if (ir_type == PIPE_SHADER_IR_NIR) {
236                 s = nir_shader_clone(NULL, ir);
237         } else {
238                 assert (ir_type == PIPE_SHADER_IR_TGSI);
239                 s = tgsi_to_nir(ir, ctx->base.screen, false);
240         }
241 
242         s->info.stage = stage;
243 
244         /* Call out to Midgard compiler given the above NIR */
245         struct panfrost_compile_inputs inputs = {
246                 .gpu_id = dev->gpu_id,
247                 .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
248         };
249 
250         memcpy(inputs.rt_formats, state->rt_formats, sizeof(inputs.rt_formats));
251 
252         panfrost_program *program;
253 
254         if (dev->quirks & IS_BIFROST)
255                 program = bifrost_compile_shader_nir(NULL, s, &inputs);
256         else
257                 program = midgard_compile_shader_nir(NULL, s, &inputs);
258 
259         /* Prepare the compiled binary for upload */
260         mali_ptr shader = 0;
261         unsigned attribute_count = 0, varying_count = 0;
262         int size = program->compiled.size;
263 
264         if (size) {
265                 state->bo = panfrost_bo_create(dev, size, PAN_BO_EXECUTE);
266                 memcpy(state->bo->ptr.cpu, program->compiled.data, size);
267                 shader = state->bo->ptr.gpu;
268         }
269 
270         /* Midgard needs the first tag on the bottom nibble */
271 
272         if (!(dev->quirks & IS_BIFROST)) {
273                 /* If size = 0, we tag as "end-of-shader" */
274 
275                 if (size)
276                         shader |= program->first_tag;
277                 else
278                         shader = 0x1;
279         }
280 
281         state->sysval_count = program->sysval_count;
282         memcpy(state->sysval, program->sysvals, sizeof(state->sysval[0]) * state->sysval_count);
283 
284         bool vertex_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
285         bool instance_id = BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
286 
287         state->writes_global = s->info.writes_memory;
288 
289         switch (stage) {
290         case MESA_SHADER_VERTEX:
291                 attribute_count = util_bitcount64(s->info.inputs_read);
292                 varying_count = util_bitcount64(s->info.outputs_written);
293 
294                 if (vertex_id)
295                         attribute_count = MAX2(attribute_count, PAN_VERTEX_ID + 1);
296 
297                 if (instance_id)
298                         attribute_count = MAX2(attribute_count, PAN_INSTANCE_ID + 1);
299 
300                 break;
301         case MESA_SHADER_FRAGMENT:
302                 for (unsigned i = 0; i < ARRAY_SIZE(state->blend_ret_addrs); i++) {
303                         if (!program->blend_ret_offsets[i])
304                                 continue;
305 
306                         state->blend_ret_addrs[i] = (state->bo->ptr.gpu & UINT32_MAX) +
307                                                     program->blend_ret_offsets[i];
308                         assert(!(state->blend_ret_addrs[i] & 0x7));
309                 }
310                 varying_count = util_bitcount64(s->info.inputs_read);
311                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
312                         state->writes_depth = true;
313                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
314                         state->writes_stencil = true;
315 
316                 uint64_t outputs_read = s->info.outputs_read;
317                 if (outputs_read & BITFIELD64_BIT(FRAG_RESULT_COLOR))
318                         outputs_read |= BITFIELD64_BIT(FRAG_RESULT_DATA0);
319 
320                 state->outputs_read = outputs_read >> FRAG_RESULT_DATA0;
321 
322                 /* List of reasons we need to execute frag shaders when things
323                  * are masked off */
324 
325                 state->fs_sidefx =
326                         s->info.writes_memory ||
327                         s->info.fs.uses_discard ||
328                         s->info.fs.uses_demote;
329                 break;
330         case MESA_SHADER_COMPUTE:
331                 /* TODO: images */
332                 state->shared_size = s->info.cs.shared_size;
333                 break;
334         default:
335                 unreachable("Unknown shader state");
336         }
337 
338         state->can_discard = s->info.fs.uses_discard;
339         state->helper_invocations = s->info.fs.needs_helper_invocations;
340         state->stack_size = program->tls_size;
341 
342         state->reads_frag_coord = (s->info.inputs_read & (1 << VARYING_SLOT_POS)) ||
343                                   BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
344         state->reads_point_coord = s->info.inputs_read & (1 << VARYING_SLOT_PNTC);
345         state->reads_face = (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
346                             BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
347         state->writes_point_size = s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
348 
349         if (outputs_written)
350                 *outputs_written = s->info.outputs_written;
351 
352         /* Separate as primary uniform count is truncated. Sysvals are prefix
353          * uniforms */
354         state->uniform_count = MIN2(s->num_uniforms + program->sysval_count, program->uniform_cutoff);
355         state->work_reg_count = program->work_register_count;
356 
357         if (dev->quirks & IS_BIFROST)
358                 for (unsigned i = 0; i < ARRAY_SIZE(state->blend_types); i++)
359                         state->blend_types[i] = bifrost_blend_type_from_nir(program->blend_types[i]);
360 
361         /* Record the varying mapping for the command stream's bookkeeping */
362 
363         nir_variable_mode varying_mode =
364                         stage == MESA_SHADER_VERTEX ? nir_var_shader_out : nir_var_shader_in;
365 
366         nir_foreach_variable_with_modes(var, s, varying_mode) {
367                 unsigned loc = var->data.driver_location;
368                 unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
369 
370                 for (int c = 0; c < sz; ++c) {
371                         state->varyings_loc[loc + c] = var->data.location + c;
372                         state->varyings[loc + c] = pan_format_from_glsl(var->type,
373                                         var->data.precision, var->data.location_frac);
374                 }
375         }
376 
377         /* Needed for linkage */
378         state->attribute_count = attribute_count;
379         state->varying_count = varying_count;
380         state->ubo_count = s->info.num_ubos + 1; /* off-by-one for uniforms */
381 
382         /* Prepare the descriptors at compile-time */
383         state->shader.shader = shader;
384         state->shader.attribute_count = attribute_count;
385         state->shader.varying_count = varying_count;
386         state->shader.texture_count = s->info.num_textures;
387         state->shader.sampler_count = s->info.num_textures;
388 
389         if (dev->quirks & IS_BIFROST)
390                 pan_prepare_bifrost_props(state, stage);
391         else
392                 pan_prepare_midgard_props(state, stage);
393 
394         state->properties.stencil_from_shader = state->writes_stencil;
395         state->properties.shader_contains_barrier = state->helper_invocations;
396         state->properties.depth_source = state->writes_depth ?
397                                          MALI_DEPTH_SOURCE_SHADER :
398                                          MALI_DEPTH_SOURCE_FIXED_FUNCTION;
399 
400         if (stage != MESA_SHADER_FRAGMENT)
401                 pan_upload_shader_descriptor(ctx, state);
402 
403         ralloc_free(program);
404 
405         /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
406          * a NULL context */
407         ralloc_free(s);
408 }
409