/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/macros.h"
#include "util/u_prim.h"
#include "util/u_vbuf.h"
#include "util/u_helpers.h"

#include "panfrost-quirks.h"

#include "pan_pool.h"
#include "pan_bo.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_job.h"

/* If a BO is accessed for a particular shader stage, will it be in the primary
 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
 * fragment will be primary, e.g. compute jobs will be considered
 * "vertex/tiler" by analogy */

static inline uint32_t
panfrost_bo_access_for_stage(enum pipe_shader_type stage)
{
        assert(stage == PIPE_SHADER_FRAGMENT ||
               stage == PIPE_SHADER_VERTEX ||
               stage == PIPE_SHADER_COMPUTE);

        return stage == PIPE_SHADER_FRAGMENT ?
               PAN_BO_ACCESS_FRAGMENT :
               PAN_BO_ACCESS_VERTEX_TILER;
}

/* Gets a GPU address for the associated index buffer. Only guaranteed to be
 * good for the duration of the draw (transient), though it could last longer.
 * Also gets the bounds on the index buffer for the range accessed by the
 * draw. We do these operations together because there are natural
 * optimizations which require them to be together. */

mali_ptr
panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
                                  const struct pipe_draw_info *info,
                                  unsigned *min_index, unsigned *max_index)
{
        struct panfrost_resource *rsrc = pan_resource(info->index.resource);
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
        off_t offset = info->start * info->index_size;
        bool needs_indices = true;
        mali_ptr out = 0;

        if (info->max_index != ~0u) {
                *min_index = info->min_index;
                *max_index = info->max_index;
                needs_indices = false;
        }

        if (!info->has_user_indices) {
                /* Only resources can be directly mapped */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);
                out = rsrc->bo->ptr.gpu + offset;

                /* Check the minmax cache to avoid rescanning the indices */
                needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
                                                           info->start,
                                                           info->count,
                                                           min_index,
                                                           max_index);
        } else {
                /* Otherwise, we need to upload to transient memory */
                const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
                struct panfrost_ptr T =
                        panfrost_pool_alloc_aligned(&batch->pool,
                                info->count * info->index_size,
                                info->index_size);

                memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
                out = T.gpu;
        }

        if (needs_indices) {
                /* Fallback */
                u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);

                if (!info->has_user_indices)
                        panfrost_minmax_cache_add(rsrc->index_cache,
                                                  info->start, info->count,
                                                  *min_index, *max_index);
        }

        return out;
}

static unsigned
translate_tex_wrap(enum pipe_tex_wrap w)
{
        switch (w) {
        case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
        case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
        case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
        case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
        case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
        default: unreachable("Invalid wrap");
        }
}

/* The hardware compares in the wrong order, so we have to flip before
 * encoding. Yes, really. */
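/* (Flipping swaps the comparison operands: LESS encodes as GREATER,
 * LEQUAL as GEQUAL, and so on.) */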

static enum mali_func
panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
{
        if (!cso->compare_mode)
                return MALI_FUNC_NEVER;

        enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
        return panfrost_flip_compare_func(f);
}

static enum mali_mipmap_mode
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
{
        switch (f) {
        case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
        case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
        default: unreachable("Invalid");
        }
}

void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
                                struct mali_midgard_sampler_packed *hw)
{
        pan_pack(hw, MIDGARD_SAMPLER, cfg) {
                cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
                        MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
                cfg.normalized_coordinates = cso->normalized_coords;

                cfg.lod_bias = FIXED_16(cso->lod_bias, true);

                cfg.minimum_lod = FIXED_16(cso->min_lod, false);

                /* If necessary, we disable mipmapping in the sampler descriptor by
                 * clamping the LOD as tight as possible (from 0 to epsilon,
                 * essentially -- remember these are fixed point numbers, so
                 * epsilon=1/256) */
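                /* e.g. min_lod = 2.0 packs to 512 in 8.8 fixed point, so
                 * the clamp below becomes [512, 513), pinning sampling to
                 * level 2 */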

                cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
                        cfg.minimum_lod + 1 :
                        FIXED_16(cso->max_lod, false);

                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);

                cfg.compare_function = panfrost_sampler_compare_func(cso);
                cfg.seamless_cube_map = cso->seamless_cube_map;

                cfg.border_color_r = cso->border_color.f[0];
                cfg.border_color_g = cso->border_color.f[1];
                cfg.border_color_b = cso->border_color.f[2];
                cfg.border_color_a = cso->border_color.f[3];
        }
}

void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
                                        struct mali_bifrost_sampler_packed *hw)
{
        pan_pack(hw, BIFROST_SAMPLER, cfg) {
                cfg.point_sample_magnify = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.point_sample_minify = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
                cfg.normalized_coordinates = cso->normalized_coords;

                cfg.lod_bias = FIXED_16(cso->lod_bias, true);
                cfg.minimum_lod = FIXED_16(cso->min_lod, false);
                cfg.maximum_lod = FIXED_16(cso->max_lod, false);

                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);

                cfg.compare_function = panfrost_sampler_compare_func(cso);
                cfg.seamless_cube_map = cso->seamless_cube_map;
        }
}

static bool
panfrost_fs_required(
                struct panfrost_shader_state *fs,
                struct panfrost_blend_final *blend,
                unsigned rt_count)
{
        /* If we generally have side effects */
        if (fs->fs_sidefx)
                return true;

        /* If colour is written we need to execute */
        for (unsigned i = 0; i < rt_count; ++i) {
                if (!blend[i].no_colour)
                        return true;
        }

        /* If depth is written and not implied we need to execute.
         * TODO: Predicate on Z/S writes being enabled */
        return (fs->writes_depth || fs->writes_stencil);
}

static void
panfrost_emit_bifrost_blend(struct panfrost_batch *batch,
                            struct panfrost_blend_final *blend,
                            void *rts)
{
        unsigned rt_count = batch->key.nr_cbufs;

        if (rt_count == 0) {
                /* Disable blending for depth-only */
                pan_pack(rts, BLEND, cfg) {
                        cfg.enable = false;
                        cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OFF;
                }
                return;
        }

        const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
        struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);

        for (unsigned i = 0; i < rt_count; ++i) {
                pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {
                        if (blend[i].no_colour) {
                                cfg.enable = false;
                        } else {
                                cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
                                cfg.load_destination = blend[i].load_dest;
                                cfg.round_to_fb_precision = !batch->ctx->blend->base.dither;
                        }

                        if (blend[i].is_shader) {
                                /* The blend shader's address needs to have
                                 * the same top 32 bits as the fragment
                                 * shader's. TODO: Ensure that's always the
                                 * case. */
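                                /* Only the low 32 bits are encoded below;
                                 * the hardware presumably takes the high
                                 * bits from the fragment shader's address,
                                 * hence the assert */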
                                assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
                                       (fs->bo->ptr.gpu & (0xffffffffull << 32)));
                                cfg.bifrost.internal.shader.pc = (u32)blend[i].shader.gpu;
                                assert(!(fs->blend_ret_addrs[i] & 0x7));
                                cfg.bifrost.internal.shader.return_value = fs->blend_ret_addrs[i];
                                cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_SHADER;
                        } else {
                                enum pipe_format format = batch->key.cbufs[i]->format;
                                const struct util_format_description *format_desc;
                                unsigned chan_size = 0;

                                format_desc = util_format_description(format);

                                for (unsigned chan = 0; chan < format_desc->nr_channels; chan++)
                                        chan_size = MAX2(format_desc->channel[chan].size, chan_size);

                                cfg.bifrost.equation = blend[i].equation.equation;

                                /* Fixed point constant */
                                u16 constant = blend[i].equation.constant * ((1 << chan_size) - 1);
                                constant <<= 16 - chan_size;
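                                /* e.g. an 8-bit UNORM target with a blend
                                 * constant of 1.0 gives 255 << 8 = 0xFF00 */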
                                cfg.bifrost.constant = constant;

                                if (blend[i].opaque)
                                        cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OPAQUE;
                                else
                                        cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_FIXED_FUNCTION;

                                /* If we want the conversion to work properly,
                                 * num_comps must be set to 4
                                 */
                                cfg.bifrost.internal.fixed_function.num_comps = 4;
                                cfg.bifrost.internal.fixed_function.conversion.memory_format.format =
                                        panfrost_format_to_bifrost_blend(format_desc, true);
                                if (dev->quirks & HAS_SWIZZLES) {
                                        cfg.bifrost.internal.fixed_function.conversion.memory_format.swizzle =
                                                panfrost_get_default_swizzle(4);
                                }
                                cfg.bifrost.internal.fixed_function.conversion.register_format =
                                        fs->blend_types[i];
                        }
                }
        }
}

static void
panfrost_emit_midgard_blend(struct panfrost_batch *batch,
                            struct panfrost_blend_final *blend,
                            void *rts)
{
        unsigned rt_count = batch->key.nr_cbufs;

        if (rt_count == 0) {
                /* Disable blending for depth-only */
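                /* Rather than leaving the descriptor zeroed, write a
                 * pass-through equation (result = src with a full colour
                 * mask), which presumably keeps the descriptor well-formed
                 * even with no colour buffer bound */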
                pan_pack(rts, BLEND, cfg) {
                        cfg.midgard.equation.color_mask = 0xf;
                        cfg.midgard.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.midgard.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.midgard.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
                        cfg.midgard.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.midgard.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.midgard.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
                }
                return;
        }

        for (unsigned i = 0; i < rt_count; ++i) {
                pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {
                        if (blend[i].no_colour) {
                                cfg.enable = false;
                                continue;
                        }

                        cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
                        cfg.load_destination = blend[i].load_dest;
                        cfg.round_to_fb_precision = !batch->ctx->blend->base.dither;
                        cfg.midgard.blend_shader = blend[i].is_shader;
                        if (blend[i].is_shader) {
                                cfg.midgard.shader_pc = blend[i].shader.gpu | blend[i].shader.first_tag;
                        } else {
                                cfg.midgard.equation = blend[i].equation.equation;
                                cfg.midgard.constant = blend[i].equation.constant;
                        }
                }
        }
}

static void
panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
                struct panfrost_blend_final *blend)
{
        const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

        if (dev->quirks & IS_BIFROST)
                panfrost_emit_bifrost_blend(batch, blend, rts);
        else
                panfrost_emit_midgard_blend(batch, blend, rts);

        for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
                if (!blend[i].no_colour)
                        batch->draws |= (PIPE_CLEAR_COLOR0 << i);
        }
}

static void
panfrost_prepare_bifrost_fs_state(struct panfrost_context *ctx,
                                  struct panfrost_blend_final *blend,
                                  struct MALI_RENDERER_STATE *state)
{
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;

        if (!panfrost_fs_required(fs, blend, rt_count)) {
                state->properties.uniform_buffer_count = 32;
                state->properties.bifrost.shader_modifies_coverage = true;
                state->properties.bifrost.allow_forward_pixel_to_kill = true;
                state->properties.bifrost.allow_forward_pixel_to_be_killed = true;
                state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
        } else {
                bool no_blend = true;

                for (unsigned i = 0; i < rt_count; ++i)
                        no_blend &= (!blend[i].load_dest || blend[i].no_colour);

                state->properties = fs->properties;
                state->properties.bifrost.allow_forward_pixel_to_kill =
                        !fs->can_discard && !fs->writes_depth && no_blend;
                state->shader = fs->shader;
                state->preload = fs->preload;
        }
}

static void
panfrost_prepare_midgard_fs_state(struct panfrost_context *ctx,
                                  struct panfrost_blend_final *blend,
                                  struct MALI_RENDERER_STATE *state)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
        bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

        if (!panfrost_fs_required(fs, blend, rt_count)) {
                state->shader.shader = 0x1;
                state->properties.midgard.work_register_count = 1;
                state->properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
                state->properties.midgard.force_early_z = true;
        } else {
                /* Reasons to disable early-Z from a shader perspective */
                bool late_z = fs->can_discard || fs->writes_global ||
                              fs->writes_depth || fs->writes_stencil;

                /* If either depth or stencil is enabled, discard matters */
                bool zs_enabled =
                        (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
                        zsa->base.stencil[0].enabled;

                bool has_blend_shader = false;

                for (unsigned c = 0; c < rt_count; ++c)
                        has_blend_shader |= blend[c].is_shader;

                /* TODO: Reduce this limit? */
                state->properties = fs->properties;
                if (has_blend_shader)
                        state->properties.midgard.work_register_count = MAX2(fs->work_reg_count, 8);
                else
                        state->properties.midgard.work_register_count = fs->work_reg_count;

                state->properties.midgard.force_early_z = !(late_z || alpha_to_coverage);

                /* Work around a hardware erratum where early-z cannot be
                 * enabled when discarding, even when the depth buffer is
                 * read-only, by lying to the hardware about the discard and
                 * setting the shader-reads-tilebuffer flag to compensate */
                state->properties.midgard.shader_reads_tilebuffer =
                        fs->outputs_read || (!zs_enabled && fs->can_discard);
                state->properties.midgard.shader_contains_discard = zs_enabled && fs->can_discard;
                state->shader = fs->shader;
        }

        if (dev->quirks & MIDGARD_SFBD) {
                state->multisample_misc.sfbd_load_destination = blend[0].load_dest;
                state->multisample_misc.sfbd_blend_shader = blend[0].is_shader;
                state->stencil_mask_misc.sfbd_write_enable = !blend[0].no_colour;
                state->stencil_mask_misc.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
                state->stencil_mask_misc.sfbd_dither_disable = !ctx->blend->base.dither;

                if (blend[0].is_shader) {
                        state->sfbd_blend_shader = blend[0].shader.gpu |
                                                   blend[0].shader.first_tag;
                } else {
                        state->sfbd_blend_equation = blend[0].equation.equation;
                        state->sfbd_blend_constant = blend[0].equation.constant;
                }
        } else {
                /* Bug where MRT-capable hardware apparently reads the last
                 * blend shader from here instead of the usual location? */

                for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
                        if (!blend[rt].is_shader)
                                continue;

                        state->sfbd_blend_shader = blend[rt].shader.gpu |
                                                   blend[rt].shader.first_tag;
                        break;
                }
        }
}

static void
panfrost_prepare_fs_state(struct panfrost_context *ctx,
                          struct panfrost_blend_final *blend,
                          struct MALI_RENDERER_STATE *state)
{
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

        if (dev->quirks & IS_BIFROST)
                panfrost_prepare_bifrost_fs_state(ctx, blend, state);
        else
                panfrost_prepare_midgard_fs_state(ctx, blend, state);

        bool msaa = rast->multisample;
        state->multisample_misc.multisample_enable = msaa;
        state->multisample_misc.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;

        /* EXT_shader_framebuffer_fetch requires per-sample */
        bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
        state->multisample_misc.evaluate_per_sample = msaa && per_sample;
        state->multisample_misc.depth_function = zsa->base.depth.enabled ?
                panfrost_translate_compare_func(zsa->base.depth.func) :
                MALI_FUNC_ALWAYS;

        state->multisample_misc.depth_write_mask = zsa->base.depth.writemask;
        state->multisample_misc.fixed_function_near_discard = rast->depth_clip_near;
        state->multisample_misc.fixed_function_far_discard = rast->depth_clip_far;
        state->multisample_misc.shader_depth_range_fixed = true;

        state->stencil_mask_misc.stencil_mask_front = zsa->stencil_mask_front;
        state->stencil_mask_misc.stencil_mask_back = zsa->stencil_mask_back;
        state->stencil_mask_misc.stencil_enable = zsa->base.stencil[0].enabled;
        state->stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
        state->stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;
        state->stencil_mask_misc.depth_range_1 = rast->offset_tri;
        state->stencil_mask_misc.depth_range_2 = rast->offset_tri;
        state->stencil_mask_misc.single_sampled_lines = !rast->multisample;
        state->depth_units = rast->offset_units * 2.0f;
        state->depth_factor = rast->offset_scale;

        bool back_enab = zsa->base.stencil[1].enabled;
        state->stencil_front = zsa->stencil_front;
        state->stencil_back = zsa->stencil_back;
        state->stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
        state->stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
}


static void
panfrost_emit_frag_shader(struct panfrost_context *ctx,
                          struct mali_renderer_state_packed *fragmeta,
                          struct panfrost_blend_final *blend)
{
        pan_pack(fragmeta, RENDERER_STATE, cfg) {
                panfrost_prepare_fs_state(ctx, blend, &cfg);
        }
}

mali_ptr
panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
{
        struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);

        panfrost_batch_add_bo(batch, ss->bo,
                              PAN_BO_ACCESS_PRIVATE |
                              PAN_BO_ACCESS_READ |
                              PAN_BO_ACCESS_VERTEX_TILER);

        panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
                              PAN_BO_ACCESS_PRIVATE |
                              PAN_BO_ACCESS_READ |
                              PAN_BO_ACCESS_VERTEX_TILER);

        return pan_resource(ss->upload.rsrc)->bo->ptr.gpu + ss->upload.offset;
}

mali_ptr
panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        /* Add the shader BO to the batch. */
        panfrost_batch_add_bo(batch, ss->bo,
                              PAN_BO_ACCESS_PRIVATE |
                              PAN_BO_ACCESS_READ |
                              PAN_BO_ACCESS_FRAGMENT);

        struct panfrost_device *dev = pan_device(ctx->base.screen);
        unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
        struct panfrost_ptr xfer;
        unsigned rt_size;

        if (dev->quirks & MIDGARD_SFBD)
                rt_size = 0;
        else
                rt_size = MALI_BLEND_LENGTH;

        unsigned desc_size = MALI_RENDERER_STATE_LENGTH + rt_size * rt_count;
        xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_RENDERER_STATE_LENGTH);

        struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
        unsigned shader_offset = 0;
        struct panfrost_bo *shader_bo = NULL;

        for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
                blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
                                                          &shader_offset);
        panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend);

        if (!(dev->quirks & MIDGARD_SFBD))
                panfrost_emit_blend(batch, xfer.cpu + MALI_RENDERER_STATE_LENGTH, blend);
        else
                batch->draws |= PIPE_CLEAR_COLOR0;

        return xfer.gpu;
}

mali_ptr
panfrost_emit_viewport(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
        const struct pipe_scissor_state *ss = &ctx->scissor;
        const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
        const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;

        /* Derive min/max from translate/scale. Note since |x| >= 0 by
         * definition, we have that -|x| <= |x| hence translate - |scale| <=
         * translate + |scale|, so the ordering is correct here. */
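        /* e.g. a viewport covering a w x h window has scale[0] = w / 2 and
         * translate[0] = w / 2, so vp_minx = 0 and vp_maxx = w */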
        float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
        float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
        float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
        float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
        float minz = (vp->translate[2] - fabsf(vp->scale[2]));
        float maxz = (vp->translate[2] + fabsf(vp->scale[2]));

        /* Scissor to the intersection of the viewport and the scissor,
         * clamped to the framebuffer */

        unsigned minx = MIN2(fb->width, MAX2((int) vp_minx, 0));
        unsigned maxx = MIN2(fb->width, MAX2((int) vp_maxx, 0));
        unsigned miny = MIN2(fb->height, MAX2((int) vp_miny, 0));
        unsigned maxy = MIN2(fb->height, MAX2((int) vp_maxy, 0));

        if (ss && rast->scissor) {
                minx = MAX2(ss->minx, minx);
                miny = MAX2(ss->miny, miny);
                maxx = MIN2(ss->maxx, maxx);
                maxy = MIN2(ss->maxy, maxy);
        }

        /* Set the range to [1, 1) so max values don't wrap round */
        if (maxx == 0 || maxy == 0)
                maxx = maxy = minx = miny = 1;
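        /* (The packed maxima below are inclusive and stored as max - 1, so
         * a zero max would wrap; [1, 1) instead gives an empty scissor.) */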

        struct panfrost_ptr T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);

        pan_pack(T.cpu, VIEWPORT, cfg) {
                /* [minx, maxx) and [miny, maxy) are exclusive ranges, but
                 * these are inclusive */
                cfg.scissor_minimum_x = minx;
                cfg.scissor_minimum_y = miny;
                cfg.scissor_maximum_x = maxx - 1;
                cfg.scissor_maximum_y = maxy - 1;

                cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
                cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
        }

        panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
        return T.gpu;
}

static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
                                 enum pipe_shader_type st,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      panfrost_bo_access_for_stage(st));

                /* Alignment guaranteed by
                 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
                return rsrc->bo->ptr.gpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return panfrost_pool_upload_aligned(&batch->pool,
                                                 cb->user_buffer +
                                                 cb->buffer_offset,
                                                 cb->buffer_size, 16);
        } else {
                unreachable("No constant buffer");
        }
}

struct sysval_uniform {
        union {
                float f[4];
                int32_t i[4];
                uint32_t u[4];
                uint64_t du[2];
        };
};

static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
                                      struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->scale[0];
        uniform->f[1] = vp->scale[1];
        uniform->f[2] = vp->scale[2];
}

static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->translate[0];
        uniform->f[1] = vp->translate[1];
        uniform->f[2] = vp->translate[2];
}

static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
                                       enum pipe_shader_type st,
                                       unsigned int sysvalid,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
        bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
        struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;

        assert(dim);
        uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);

        if (dim > 1)
                uniform->i[1] = u_minify(tex->texture->height0,
                                         tex->u.tex.first_level);

        if (dim > 2)
                uniform->i[2] = u_minify(tex->texture->depth0,
                                         tex->u.tex.first_level);

        if (is_array)
                uniform->i[dim] = tex->texture->array_size;
}

static void
panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
                            enum pipe_shader_type st,
                            unsigned ssbo_id,
                            struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
        struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];

        /* Compute address */
        struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;

        panfrost_batch_add_bo(batch, bo,
                              PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
                              panfrost_bo_access_for_stage(st));

        /* Upload address and size as sysval */
        uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
        uniform->u[2] = sb.buffer_size;
}

static void
panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
                               enum pipe_shader_type st,
                               unsigned samp_idx,
                               struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;

        uniform->f[0] = sampl->min_lod;
        uniform->f[1] = sampl->max_lod;
        uniform->f[2] = sampl->lod_bias;

        /* Even without any errata, Midgard represents "no mipmapping" as
         * fixing the LOD with the clamps; keep behaviour consistent. c.f.
         * panfrost_create_sampler_state which also explains our choice of
         * epsilon value (again to keep behaviour consistent) */

        if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                uniform->f[1] = uniform->f[0] + (1.0/256.0);
}

static void
panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;

        uniform->u[0] = ctx->compute_grid->grid[0];
        uniform->u[1] = ctx->compute_grid->grid[1];
        uniform->u[2] = ctx->compute_grid->grid[2];
}

static void
panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
                        struct panfrost_shader_state *ss,
                        enum pipe_shader_type st)
{
        struct sysval_uniform *uniforms = (void *)buf;

        for (unsigned i = 0; i < ss->sysval_count; ++i) {
                int sysval = ss->sysval[i];

                switch (PAN_SYSVAL_TYPE(sysval)) {
                case PAN_SYSVAL_VIEWPORT_SCALE:
                        panfrost_upload_viewport_scale_sysval(batch,
                                                              &uniforms[i]);
                        break;
                case PAN_SYSVAL_VIEWPORT_OFFSET:
                        panfrost_upload_viewport_offset_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_TEXTURE_SIZE:
                        panfrost_upload_txs_sysval(batch, st,
                                                   PAN_SYSVAL_ID(sysval),
                                                   &uniforms[i]);
                        break;
                case PAN_SYSVAL_SSBO:
                        panfrost_upload_ssbo_sysval(batch, st,
                                                    PAN_SYSVAL_ID(sysval),
                                                    &uniforms[i]);
                        break;
                case PAN_SYSVAL_NUM_WORK_GROUPS:
                        panfrost_upload_num_work_groups_sysval(batch,
                                                               &uniforms[i]);
                        break;
                case PAN_SYSVAL_SAMPLER:
                        panfrost_upload_sampler_sysval(batch, st,
                                                       PAN_SYSVAL_ID(sysval),
                                                       &uniforms[i]);
                        break;
                default:
                        assert(0);
                }
        }
}

static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc)
                return rsrc->bo->ptr.cpu;
        else if (cb->user_buffer)
                return cb->user_buffer;
        else
                unreachable("No constant buffer");
}

mali_ptr
panfrost_emit_const_buf(struct panfrost_batch *batch,
                        enum pipe_shader_type stage,
                        mali_ptr *push_constants)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_variants *all = ctx->shader[stage];

        if (!all)
                return 0;

        struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];

        struct panfrost_shader_state *ss = &all->variants[all->active_variant];

        /* Uniforms are implicitly UBO #0 */
        bool has_uniforms = buf->enabled_mask & (1 << 0);

        /* Allocate room for the sysvals and the uniforms */
        size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
        size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
        size_t size = sys_size + uniform_size;
        struct panfrost_ptr transfer =
                panfrost_pool_alloc_aligned(&batch->pool, size, 16);

        /* Upload sysvals requested by the shader */
        panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);

        /* Upload uniforms */
        if (has_uniforms && uniform_size) {
                const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
                memcpy(transfer.cpu + sys_size, cpu, uniform_size);
        }

        /* Next up, attach UBOs. UBO #0 is the uniforms we just
         * uploaded, so it's always included. The count is the highest UBO
         * addressable -- gaps are included. */
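        /* 32 - clz gives the index of the highest set bit, plus one; OR-ing
         * in bit 0 counts the implicit uniform UBO even with no app UBOs */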
        unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);

        size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
        struct panfrost_ptr ubos =
                panfrost_pool_alloc_aligned(&batch->pool, sz,
                                MALI_UNIFORM_BUFFER_LENGTH);

        uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;

        /* Upload uniforms as a UBO */

        if (size) {
                pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
                        cfg.entries = DIV_ROUND_UP(size, 16);
                        cfg.pointer = transfer.gpu;
                }
        } else {
                *ubo_ptr = 0;
        }

        /* The rest are honest-to-goodness UBOs */

        for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
                size_t usz = buf->cb[ubo].buffer_size;
                bool enabled = buf->enabled_mask & (1 << ubo);
                bool empty = usz == 0;

                if (!enabled || empty) {
                        ubo_ptr[ubo] = 0;
                        continue;
                }

                /* Issue (57) for the ARB_uniform_buffer_object spec says that
                 * the buffer can be larger than the uniform data inside it,
                 * so clamp the UBO size to what the hardware supports. */

                pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
                        cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);
                        cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
                                        stage, buf, ubo);
                }
        }

        if (ss->uniform_count)
                *push_constants = transfer.gpu;

        buf->dirty_mask = 0;
        return ubos.gpu;
}

mali_ptr
panfrost_emit_shared_memory(struct panfrost_batch *batch,
                            const struct pipe_grid_info *info)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
        unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
                                                           128));

        unsigned instances =
                util_next_power_of_two(info->grid[0]) *
                util_next_power_of_two(info->grid[1]) *
                util_next_power_of_two(info->grid[2]);
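        /* Sizes and counts are rounded up to powers of two: the per-instance
         * size is log2-encoded (see wls_size_scale below), and the
         * per-dimension rounding presumably matches how the hardware indexes
         * workgroup-local storage instances */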
964 
965         unsigned shared_size = single_size * instances * dev->core_count;
966         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
967                                                                   shared_size,
968                                                                   1);
969         struct panfrost_ptr t =
970                 panfrost_pool_alloc_aligned(&batch->pool,
971                                             MALI_LOCAL_STORAGE_LENGTH,
972                                             64);
973 
974         pan_pack(t.cpu, LOCAL_STORAGE, ls) {
975                 ls.wls_base_pointer = bo->ptr.gpu;
976                 ls.wls_instances = instances;
977                 ls.wls_size_scale = util_logbase2(single_size) + 1;
978         };
979 
980         return t.gpu;
981 }
982 
983 static mali_ptr
panfrost_get_tex_desc(struct panfrost_batch * batch,enum pipe_shader_type st,struct panfrost_sampler_view * view)984 panfrost_get_tex_desc(struct panfrost_batch *batch,
985                       enum pipe_shader_type st,
986                       struct panfrost_sampler_view *view)
987 {
988         if (!view)
989                 return (mali_ptr) 0;
990 
991         struct pipe_sampler_view *pview = &view->base;
992         struct panfrost_resource *rsrc = pan_resource(pview->texture);
993 
994         /* Add the BO to the job so it's retained until the job is done. */
995 
996         panfrost_batch_add_bo(batch, rsrc->bo,
997                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
998                               panfrost_bo_access_for_stage(st));
999 
1000         panfrost_batch_add_bo(batch, view->bo,
1001                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1002                               panfrost_bo_access_for_stage(st));
1003 
1004         return view->bo->ptr.gpu;
1005 }
1006 
1007 static void
panfrost_update_sampler_view(struct panfrost_sampler_view * view,struct pipe_context * pctx)1008 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1009                              struct pipe_context *pctx)
1010 {
1011         struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1012         if (view->texture_bo != rsrc->bo->ptr.gpu ||
1013             view->modifier != rsrc->modifier) {
1014                 panfrost_bo_unreference(view->bo);
1015                 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1016         }
1017 }
1018 
1019 mali_ptr
panfrost_emit_texture_descriptors(struct panfrost_batch * batch,enum pipe_shader_type stage)1020 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1021                                   enum pipe_shader_type stage)
1022 {
1023         struct panfrost_context *ctx = batch->ctx;
1024         struct panfrost_device *device = pan_device(ctx->base.screen);
1025 
1026         if (!ctx->sampler_view_count[stage])
1027                 return 0;
1028 
1029         if (device->quirks & IS_BIFROST) {
1030                 struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
1031                                 MALI_BIFROST_TEXTURE_LENGTH *
1032                                 ctx->sampler_view_count[stage],
1033                                 MALI_BIFROST_TEXTURE_LENGTH);
1034 
1035                 struct mali_bifrost_texture_packed *out =
1036                         (struct mali_bifrost_texture_packed *) T.cpu;
1037 
1038                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1039                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1040                         struct pipe_sampler_view *pview = &view->base;
1041                         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1042 
1043                         panfrost_update_sampler_view(view, &ctx->base);
1044                         out[i] = view->bifrost_descriptor;
1045 
1046                         /* Add the BOs to the job so they are retained until the job is done. */
1047 
1048                         panfrost_batch_add_bo(batch, rsrc->bo,
1049                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1050                                               panfrost_bo_access_for_stage(stage));
1051 
1052                         panfrost_batch_add_bo(batch, view->bo,
1053                                               PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1054                                               panfrost_bo_access_for_stage(stage));
1055                 }
1056 
1057                 return T.gpu;
1058         } else {
1059                 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1060 
1061                 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1062                         struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1063 
1064                         panfrost_update_sampler_view(view, &ctx->base);
1065 
1066                         trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1067                 }
1068 
1069                 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1070                                 sizeof(uint64_t) *
1071                                 ctx->sampler_view_count[stage],
1072                                 sizeof(uint64_t));
1073         }
1074 }
1075 
1076 mali_ptr
panfrost_emit_sampler_descriptors(struct panfrost_batch * batch,enum pipe_shader_type stage)1077 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1078                                   enum pipe_shader_type stage)
1079 {
1080         struct panfrost_context *ctx = batch->ctx;
1081 
1082         if (!ctx->sampler_count[stage])
1083                 return 0;
1084 
1085         size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1086         assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1087 
1088         size_t sz = desc_size * ctx->sampler_count[stage];
1089         struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1090         struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1091 
1092         for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1093                 out[i] = ctx->samplers[stage][i]->hw;
1094 
1095         return T.gpu;
1096 }
1097 
1098 mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch * batch,mali_ptr * buffers)1099 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1100                           mali_ptr *buffers)
1101 {
1102         struct panfrost_context *ctx = batch->ctx;
1103         struct panfrost_device *dev = pan_device(ctx->base.screen);
1104         bool is_bifrost = !!(dev->quirks & IS_BIFROST);
1105         struct panfrost_vertex_state *so = ctx->vertex;
1106         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1107 
1108         /* Worst case: everything is NPOT, which is only possible if instancing
1109          * is enabled. Otherwise single record is gauranteed */
1110         struct panfrost_ptr S = panfrost_pool_alloc_aligned(&batch->pool,
1111                         MALI_ATTRIBUTE_BUFFER_LENGTH * (vs->attribute_count + 1) *
1112                         (ctx->instance_count > 1 ? 2 : 1),
1113                         MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1114 
1115         struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
1116                         MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1117                         MALI_ATTRIBUTE_LENGTH);
1118 
1119         struct mali_attribute_buffer_packed *bufs =
1120                 (struct mali_attribute_buffer_packed *) S.cpu;
1121 
1122         /* Determine (n + 1)'th index to suppress prefetch on Bifrost */
1123         unsigned last = vs->attribute_count * ((ctx->instance_count > 1) ? 2 : 1);
1124         memset(bufs + last, 0, sizeof(*bufs));
1125 
1126         struct mali_attribute_packed *out =
1127                 (struct mali_attribute_packed *) T.cpu;
1128 
1129         unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1130         unsigned k = 0;
1131 
1132         for (unsigned i = 0; i < so->num_elements; ++i) {
1133                 /* We map buffers 1:1 with the attributes, which
1134                  * means duplicating some vertex buffers (who cares? aside from
1135                  * maybe some caching implications but I somehow doubt that
1136                  * matters) */

                struct pipe_vertex_element *elem = &so->pipe[i];
                unsigned vbi = elem->vertex_buffer_index;
                attrib_to_buffer[i] = k;

                if (!(ctx->vb_mask & (1 << vbi)))
                        continue;

                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
                struct panfrost_resource *rsrc;

                rsrc = pan_resource(buf->buffer.resource);
                if (!rsrc)
                        continue;

                /* Add a dependency of the batch on the vertex buffer */
                panfrost_batch_add_bo(batch, rsrc->bo,
                                      PAN_BO_ACCESS_SHARED |
                                      PAN_BO_ACCESS_READ |
                                      PAN_BO_ACCESS_VERTEX_TILER);

                /* Mask off lower bits, see offset fixup below */
                mali_ptr raw_addr = rsrc->bo->ptr.gpu + buf->buffer_offset;
                mali_ptr addr = raw_addr & ~63;

                /* The base pointer was aligned down, so grow the size by the
                 * masked-off slack; the buffer starts buffer_offset bytes into
                 * the resource, so shrink it by that much */
                unsigned size = rsrc->base.width0 + (raw_addr - addr)
                        - buf->buffer_offset;

                /* When there is a divisor, the hardware-level divisor is
                 * the product of the instance divisor and the padded count */
                unsigned divisor = elem->instance_divisor;
                unsigned hw_divisor = ctx->padded_count * divisor;
                unsigned stride = buf->stride;

                /* If there is a divisor but no instancing, the attribute is
                 * constant across the draw, so zero the stride */

                if (divisor && ctx->instance_count == 1)
                        stride = 0;

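                /* Worked example (illustrative values): an instance divisor
                 * of 2 with a padded count of 8 gives hw_divisor = 16, a
                 * power of two, which the POT path below encodes as
                 * divisor_r = ctz(16) = 4. A divisor of 3 with a padded
                 * count of 4 gives hw_divisor = 12, which takes the NPOT
                 * (magic divisor) path instead. */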
                if (!divisor || ctx->instance_count <= 1) {
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                if (ctx->instance_count > 1) {
                                        cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
                                        cfg.divisor = ctx->padded_count;
                                }

                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;
                        }
                } else if (util_is_power_of_two_or_zero(hw_divisor)) {
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;
                                cfg.divisor_r = __builtin_ctz(hw_divisor);
                        }

                } else {
                        unsigned shift = 0, extra_flags = 0;

                        unsigned magic_divisor =
                                panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
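                        /* Sketch of the idea (see the helper's definition for
                         * the exact math): magic_divisor is a fixed-point
                         * reciprocal of hw_divisor, letting the hardware
                         * evaluate index / hw_divisor as a multiply-and-shift;
                         * extra_flags carries a rounding correction for
                         * divisors that need it. */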

                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
                                cfg.pointer = addr;
                                cfg.stride = stride;
                                cfg.size = size;

                                cfg.divisor_r = shift;
                                cfg.divisor_e = extra_flags;
                        }

                        pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
                                cfg.divisor_numerator = magic_divisor;
                                cfg.divisor = divisor;
                        }

                        ++k;
                }

                ++k;
        }

        /* Add special gl_VertexID/gl_InstanceID buffers */

        if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
                panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);

                pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
                        cfg.buffer_index = k++;
                        cfg.format = so->formats[PAN_VERTEX_ID];
                }

                panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);

                pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
                        cfg.buffer_index = k++;
                        cfg.format = so->formats[PAN_INSTANCE_ID];
                }
        }

        /* We need an empty attrib buf to stop the prefetching on Bifrost */
        if (is_bifrost)
                pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);

        /* Attribute addresses require 64-byte alignment, so let:
         *
         *      base' = base & ~63 = base - (base & 63)
         *      offset' = offset + (base & 63)
         *
         * Since base' + offset' = base + offset, these are equivalent
         * addressing modes and now base is 64 aligned.
         */
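        /* For example, base = 0x10047 with offset = 4 becomes base' = 0x10040
         * and offset' = 4 + 7 = 11; base' + offset' = 0x1004B = base + offset,
         * and base' is now 64-byte aligned. */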

        for (unsigned i = 0; i < so->num_elements; ++i) {
                unsigned vbi = so->pipe[i].vertex_buffer_index;
                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];

                /* Adjust by the masked off bits of the offset. Make sure we
                 * read src_offset from so->hw (which is not GPU visible)
                 * rather than target (which is) due to caching effects */

                unsigned src_offset = so->pipe[i].src_offset;

                /* BOs are aligned to 4k, so guaranteed aligned to 64 */
                src_offset += (buf->buffer_offset & 63);

                /* Also, somewhat obscurely, per-instance data needs to be
                 * offset in response to a delayed start in an indexed draw */

                if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
                        src_offset -= buf->stride * ctx->offset_start;

                pan_pack(out + i, ATTRIBUTE, cfg) {
                        cfg.buffer_index = attrib_to_buffer[i];
                        cfg.format = so->formats[i];
                        cfg.offset = src_offset;
                }
        }

        *buffers = S.gpu;
        return T.gpu;
}

static mali_ptr
panfrost_emit_varyings(struct panfrost_batch *batch,
                struct mali_attribute_buffer_packed *slot,
                unsigned stride, unsigned count)
{
        unsigned size = stride * count;
        mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;

        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
                cfg.stride = stride;
                cfg.size = size;
                cfg.pointer = ptr;
        }

        return ptr;
}

static unsigned
panfrost_streamout_offset(unsigned stride,
                        struct pipe_stream_output_target *target)
{
        return (target->buffer_offset + (pan_so_target(target)->offset * stride * 4)) & 63;
}
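
/* For example, with buffer_offset = 100, a stream-out offset of 3 vertices
 * and a stride of 4 words (16 bytes), the byte offset is 100 + 3 * 16 = 148,
 * and 148 & 63 = 20 is the slack folded into the record offset rather than
 * the (64-byte aligned) buffer pointer. */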

static void
panfrost_emit_streamout(struct panfrost_batch *batch,
                        struct mali_attribute_buffer_packed *slot,
                        unsigned stride_words, unsigned count,
                        struct pipe_stream_output_target *target)
{
        unsigned stride = stride_words * 4;
        unsigned max_size = target->buffer_size;
        unsigned expected_size = stride * count;

        /* Grab the BO and bind it to the batch */
        struct panfrost_bo *bo = pan_resource(target->buffer)->bo;

        /* Varyings are WRITE from the perspective of the VERTEX but READ from
         * the perspective of the TILER and FRAGMENT.
         */
        panfrost_batch_add_bo(batch, bo,
                              PAN_BO_ACCESS_SHARED |
                              PAN_BO_ACCESS_RW |
                              PAN_BO_ACCESS_VERTEX_TILER |
                              PAN_BO_ACCESS_FRAGMENT);

        /* We will have an offset applied to get alignment */
        mali_ptr addr = bo->ptr.gpu + target->buffer_offset + (pan_so_target(target)->offset * stride);

        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
                cfg.pointer = (addr & ~63);
                cfg.stride = stride;
                cfg.size = MIN2(max_size, expected_size) + (addr & 63);
        }
}

/* Helpers for manipulating stream out information so we can pack varyings
 * accordingly. Compute the src_offset for a given captured varying */

static struct pipe_stream_output *
pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
{
        for (unsigned i = 0; i < info->num_outputs; ++i) {
                if (info->output[i].register_index == loc)
                        return &info->output[i];
        }

        unreachable("Varying not captured");
}

static unsigned
pan_varying_size(enum mali_format fmt)
{
        unsigned type = MALI_EXTRACT_TYPE(fmt);
        unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
        unsigned bits = MALI_EXTRACT_BITS(fmt);
        unsigned bpc = 0;

        if (bits == MALI_CHANNEL_FLOAT) {
                /* No doubles */
                bool fp16 = (type == MALI_FORMAT_SINT);
                assert(fp16 || (type == MALI_FORMAT_UNORM));

                bpc = fp16 ? 2 : 4;
        } else {
                assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);

                /* See the enums */
                bits = 1 << bits;
                assert(bits >= 8);
                bpc = bits / 8;
        }

        return bpc * chan;
}

/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
 * PAN_VARY_*). This has the nice property that you can look up the buffer
 * index of a given special field given a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is... count the varyings that come earlier; that count is the index of
 * the new one. Likewise, the total number of special buffers required is
 * simply popcount(P)
 */
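
/* For example, present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
 * (1 << PAN_VARY_PSIZ) = 0b111 gives index popcount(0b111 & 0b011) = 2 for
 * PAN_VARY_PSIZ (S = 2), with popcount(0b111) = 3 buffers required in total. */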

enum pan_special_varying {
        PAN_VARY_GENERAL = 0,
        PAN_VARY_POSITION = 1,
        PAN_VARY_PSIZ = 2,
        PAN_VARY_PNTCOORD = 3,
        PAN_VARY_FACE = 4,
        PAN_VARY_FRAGCOORD = 5,

        /* Keep last */
        PAN_VARY_MAX,
};

/* Given a varying, figure out which index it corresponds to */

static inline unsigned
pan_varying_index(unsigned present, enum pan_special_varying v)
{
        unsigned mask = (1 << v) - 1;
        return util_bitcount(present & mask);
}

/* Get the base offset for XFB buffers, which by convention come after
 * everything else. Wrapper function for semantic reasons; by construction this
 * is just popcount. */

static inline unsigned
pan_xfb_base(unsigned present)
{
        return util_bitcount(present);
}

/* Computes the present mask for varyings so we can start emitting varying records */

static inline unsigned
pan_varying_present(
        struct panfrost_shader_state *vs,
        struct panfrost_shader_state *fs,
        unsigned quirks,
        uint16_t point_coord_mask)
{
        /* At the moment we always emit general and position buffers. Not
         * strictly necessary but usually harmless */

        unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);

        /* Enable special buffers by the shader info */

        if (vs->writes_point_size)
                present |= (1 << PAN_VARY_PSIZ);

        if (fs->reads_point_coord)
                present |= (1 << PAN_VARY_PNTCOORD);

        if (fs->reads_face)
                present |= (1 << PAN_VARY_FACE);

        if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
                present |= (1 << PAN_VARY_FRAGCOORD);

        /* Also, if we have a point sprite, we need a point coord buffer */

        for (unsigned i = 0; i < fs->varying_count; i++) {
                gl_varying_slot loc = fs->varyings_loc[i];

                if (util_varying_is_point_coord(loc, point_coord_mask))
                        present |= (1 << PAN_VARY_PNTCOORD);
        }

        return present;
}

/* Emitters for varying records */

static void
pan_emit_vary(struct mali_attribute_packed *out,
                unsigned present, enum pan_special_varying buf,
                unsigned quirks, enum mali_format format,
                unsigned offset)
{
        unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
        unsigned swizzle = quirks & HAS_SWIZZLES ?
                        panfrost_get_default_swizzle(nr_channels) :
                        panfrost_bifrost_swizzle(nr_channels);

        pan_pack(out, ATTRIBUTE, cfg) {
                cfg.buffer_index = pan_varying_index(present, buf);
                cfg.offset_enable = !(quirks & IS_BIFROST);
                cfg.format = (format << 12) | swizzle;
                cfg.offset = offset;
        }
}

/* General varying that is unused */

static void
pan_emit_vary_only(struct mali_attribute_packed *out,
                unsigned present, unsigned quirks)
{
        pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, MALI_CONSTANT, 0);
}

/* Special records */

static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
        [PAN_VARY_POSITION]     = MALI_SNAP_4,
        [PAN_VARY_PSIZ]         = MALI_R16F,
        [PAN_VARY_PNTCOORD]     = MALI_R16F,
        [PAN_VARY_FACE]         = MALI_R32I,
        [PAN_VARY_FRAGCOORD]    = MALI_RGBA32F
};

static void
pan_emit_vary_special(struct mali_attribute_packed *out,
                unsigned present, enum pan_special_varying buf,
                unsigned quirks)
{
        assert(buf < PAN_VARY_MAX);
        pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
}

static enum mali_format
pan_xfb_format(enum mali_format format, unsigned nr)
{
        if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
                return MALI_R32F | MALI_NR_CHANNELS(nr);
        else
                return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
}
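
/* For example, an fp16 vec3 varying captured to XFB is promoted to a
 * three-channel 32-bit float format, matching the highp values that
 * transform feedback stores. */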

/* Transform feedback records. Note struct pipe_stream_output is (if packed as
 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
 * value. */

static void
pan_emit_vary_xfb(struct mali_attribute_packed *out,
                unsigned present,
                unsigned max_xfb,
                unsigned *streamout_offsets,
                unsigned quirks,
                enum mali_format format,
                struct pipe_stream_output o)
{
        unsigned swizzle = quirks & HAS_SWIZZLES ?
                        panfrost_get_default_swizzle(o.num_components) :
                        panfrost_bifrost_swizzle(o.num_components);

        pan_pack(out, ATTRIBUTE, cfg) {
                /* XFB buffers come after everything else */
                cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
                cfg.offset_enable = !(quirks & IS_BIFROST);

                /* Override number of channels and precision to highp */
                cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;

                /* Apply given offsets together */
                cfg.offset = (o.dst_offset * 4) /* dwords */
                        + streamout_offsets[o.output_buffer];
        }
}
/* Determine if we should capture a varying for XFB. This requires actually
 * having a buffer for it. If we don't capture it, we'll fall back to a general
 * varying path (linked or unlinked, possibly discarding the write) */

static bool
panfrost_xfb_captured(struct panfrost_shader_state *xfb,
                unsigned loc, unsigned max_xfb)
{
        if (!(xfb->so_mask & (1ll << loc)))
                return false;

        struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
        return o->output_buffer < max_xfb;
}

static void
pan_emit_general_varying(struct mali_attribute_packed *out,
                struct panfrost_shader_state *other,
                struct panfrost_shader_state *xfb,
                gl_varying_slot loc,
                enum mali_format format,
                unsigned present,
                unsigned quirks,
                unsigned *gen_offsets,
                enum mali_format *gen_formats,
                unsigned *gen_stride,
                unsigned idx,
                bool should_alloc)
{
        /* Check if we're linked */
        signed other_idx = -1;

        for (unsigned j = 0; j < other->varying_count; ++j) {
                if (other->varyings_loc[j] == loc) {
                        other_idx = j;
                        break;
                }
        }

        if (other_idx < 0) {
                pan_emit_vary_only(out, present, quirks);
                return;
        }

        unsigned offset = gen_offsets[other_idx];

        if (should_alloc) {
                /* We're linked, so allocate a space via a watermark allocation */
                enum mali_format alt = other->varyings[other_idx];

                /* Do interpolation at minimum precision */
                unsigned size_main = pan_varying_size(format);
                unsigned size_alt = pan_varying_size(alt);
                unsigned size = MIN2(size_main, size_alt);
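                /* For example, if the producer writes an fp32 vec4 (16 bytes)
                 * but the consumer reads an fp16 vec4 (8 bytes), interpolate
                 * at 8 bytes */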

                /* If a varying is marked for XFB but not actually captured, we
                 * should match the format to the format that would otherwise
                 * be used for XFB, since dEQP checks for invariance here. It's
                 * unclear if this is required by the spec. */

                if (xfb->so_mask & (1ull << loc)) {
                        struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
                        format = pan_xfb_format(format, o->num_components);
                        size = pan_varying_size(format);
                } else if (size == size_alt) {
                        format = alt;
                }

                gen_offsets[idx] = *gen_stride;
                gen_formats[other_idx] = format;
                offset = *gen_stride;
                *gen_stride += size;
        }

        pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
}

/* Higher-level wrapper around all of the above, classifying a varying into one
 * of the above types */

static void
panfrost_emit_varying(
                struct mali_attribute_packed *out,
                struct panfrost_shader_state *stage,
                struct panfrost_shader_state *other,
                struct panfrost_shader_state *xfb,
                unsigned present,
                uint16_t point_sprite_mask,
                unsigned max_xfb,
                unsigned *streamout_offsets,
                unsigned quirks,
                unsigned *gen_offsets,
                enum mali_format *gen_formats,
                unsigned *gen_stride,
                unsigned idx,
                bool should_alloc,
                bool is_fragment)
{
        gl_varying_slot loc = stage->varyings_loc[idx];
        enum mali_format format = stage->varyings[idx];

        /* Override format to match linkage */
        if (!should_alloc && gen_formats[idx])
                format = gen_formats[idx];

        if (util_varying_is_point_coord(loc, point_sprite_mask)) {
                pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
        } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
                struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
                pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
        } else if (loc == VARYING_SLOT_POS) {
                if (is_fragment)
                        pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
                else
                        pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
        } else if (loc == VARYING_SLOT_PSIZ) {
                pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
        } else if (loc == VARYING_SLOT_PNTC) {
                pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
        } else if (loc == VARYING_SLOT_FACE) {
                pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
        } else {
                pan_emit_general_varying(out, other, xfb, loc, format, present,
                                quirks, gen_offsets, gen_formats, gen_stride,
                                idx, should_alloc);
        }
}

static void
pan_emit_special_input(struct mali_attribute_buffer_packed *out,
                unsigned present,
                enum pan_special_varying v,
                unsigned special)
{
        if (present & (1 << v)) {
                unsigned idx = pan_varying_index(present, v);

                pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
                        cfg.special = special;
                        cfg.type = 0;
                }
        }
}

void
panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                                 unsigned vertex_count,
                                 mali_ptr *vs_attribs,
                                 mali_ptr *fs_attribs,
                                 mali_ptr *buffers,
                                 mali_ptr *position,
                                 mali_ptr *psiz)
{
        /* Load the shaders */
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *vs, *fs;
        size_t vs_size, fs_size;

        /* Allocate the varying descriptor */

        vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
        fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;

        struct panfrost_ptr trans = panfrost_pool_alloc_aligned(
                        &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);

        struct pipe_stream_output_info *so = &vs->stream_output;
        uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;

        /* TODO: point sprites need lowering on Bifrost */
        if (dev->quirks & IS_BIFROST)
                point_coord_mask = 0;

        unsigned present = pan_varying_present(vs, fs, dev->quirks, point_coord_mask);

        /* Check if this varying is linked by us. This is the case for
         * general-purpose, non-captured varyings. If it is, link it. If it's
         * not, use the provided stream out information to determine the
         * offset, since it was already linked for us. */

        unsigned gen_offsets[32];
        enum mali_format gen_formats[32];
        memset(gen_offsets, 0, sizeof(gen_offsets));
        memset(gen_formats, 0, sizeof(gen_formats));

        unsigned gen_stride = 0;
        assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
        assert(fs->varying_count < ARRAY_SIZE(gen_offsets));

        unsigned streamout_offsets[32];

        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
                streamout_offsets[i] = panfrost_streamout_offset(
                                        so->stride[i],
                                        ctx->streamout.targets[i]);
        }

        struct mali_attribute_packed *ovs = (struct mali_attribute_packed *) trans.cpu;
        struct mali_attribute_packed *ofs = ovs + vs->varying_count;

        for (unsigned i = 0; i < vs->varying_count; i++) {
                panfrost_emit_varying(ovs + i, vs, fs, vs, present, 0,
                                ctx->streamout.num_targets, streamout_offsets,
                                dev->quirks,
                                gen_offsets, gen_formats, &gen_stride, i, true, false);
        }

        for (unsigned i = 0; i < fs->varying_count; i++) {
                panfrost_emit_varying(ofs + i, fs, vs, vs, present, point_coord_mask,
                                ctx->streamout.num_targets, streamout_offsets,
                                dev->quirks,
                                gen_offsets, gen_formats, &gen_stride, i, false, true);
        }

        unsigned xfb_base = pan_xfb_base(present);
        struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
                        MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets + 1),
                        MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
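
        /* Buffer layout: the special buffers named by the present mask come
         * first, then one record per XFB target, then a single zeroed record
         * to stop Bifrost prefetch (hence the + 1 above) */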
        struct mali_attribute_buffer_packed *varyings =
                (struct mali_attribute_buffer_packed *) T.cpu;

        /* Suppress prefetch on Bifrost: zero the record just past the last
         * XFB target */
        memset(varyings + (xfb_base + ctx->streamout.num_targets), 0, sizeof(*varyings));

        /* Emit the stream out buffers */

        unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
                                                           ctx->vertex_count);

        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
                panfrost_emit_streamout(batch, &varyings[xfb_base + i],
                                        so->stride[i],
                                        out_count,
                                        ctx->streamout.targets[i]);
        }

        panfrost_emit_varyings(batch,
                        &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
                        gen_stride, vertex_count);

        /* fp32 vec4 gl_Position */
        *position = panfrost_emit_varyings(batch,
                        &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
                        sizeof(float) * 4, vertex_count);

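        /* fp16 gl_PointSize, when the vertex shader writes it */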
        if (present & (1 << PAN_VARY_PSIZ)) {
                *psiz = panfrost_emit_varyings(batch,
                                &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
                                2, vertex_count);
        }

        pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
        pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
        pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);

        *buffers = T.gpu;
        *vs_attribs = trans.gpu;
        *fs_attribs = trans.gpu + vs_size;
}

void
panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
                                const struct panfrost_ptr *vertex_job,
                                const struct panfrost_ptr *tiler_job)
{
        struct panfrost_context *ctx = batch->ctx;

        /* If rasterizer discard is enabled, only submit the vertex job */

        unsigned vertex = panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
                                           vertex_job, false);

        if (ctx->rasterizer->base.rasterizer_discard)
                return;

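        /* The tiler job reads the vertex job's varying output, so pass the
         * vertex job's scoreboard index as its dependency */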
        panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tiler_job, false);
}

/* TODO: stop hardcoding this */
mali_ptr
panfrost_emit_sample_locations(struct panfrost_batch *batch)
{
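        /* Sample positions as pairs of 16-bit fixed-point coordinates;
         * (128, 128) appears to encode the pixel centre on a 1/256 grid,
         * presumably indexed by sample pattern, hence the TODO above */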
        uint16_t locations[] = {
            128, 128,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            0, 256,
            128, 128,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
            0, 0,
        };

        return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
}