1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28 #include "util/u_helpers.h"
29
30 #include "panfrost-quirks.h"
31
32 #include "pan_pool.h"
33 #include "pan_bo.h"
34 #include "pan_cmdstream.h"
35 #include "pan_context.h"
36 #include "pan_job.h"
37
38 /* If a BO is accessed for a particular shader stage, will it be in the primary
39 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
40 * fragment will be primary, e.g. compute jobs will be considered
41 * "vertex/tiler" by analogy */
42
43 static inline uint32_t
44 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
45 {
46 assert(stage == PIPE_SHADER_FRAGMENT ||
47 stage == PIPE_SHADER_VERTEX ||
48 stage == PIPE_SHADER_COMPUTE);
49
50 return stage == PIPE_SHADER_FRAGMENT ?
51 PAN_BO_ACCESS_FRAGMENT :
52 PAN_BO_ACCESS_VERTEX_TILER;
53 }
54
55 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
56 * good for the duration of the draw (transient), though it may last longer. Also gets
57 * the bounds on the index buffer for the range accessed by the draw. We do
58 * these operations together because there are natural optimizations which
59 * require them to be together. */
60
61 mali_ptr
62 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
63 const struct pipe_draw_info *info,
64 unsigned *min_index, unsigned *max_index)
65 {
66 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
67 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
68 off_t offset = info->start * info->index_size;
69 bool needs_indices = true;
70 mali_ptr out = 0;
71
72 if (info->max_index != ~0u) {
73 *min_index = info->min_index;
74 *max_index = info->max_index;
75 needs_indices = false;
76 }
77
78 if (!info->has_user_indices) {
79 /* Only resources can be directly mapped */
80 panfrost_batch_add_bo(batch, rsrc->bo,
81 PAN_BO_ACCESS_SHARED |
82 PAN_BO_ACCESS_READ |
83 PAN_BO_ACCESS_VERTEX_TILER);
84 out = rsrc->bo->ptr.gpu + offset;
85
86 /* Check the cache */
87 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
88 info->start,
89 info->count,
90 min_index,
91 max_index);
92 } else {
93 /* Otherwise, we need to upload to transient memory */
94 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
95 struct panfrost_ptr T =
96 panfrost_pool_alloc_aligned(&batch->pool,
97 info->count * info->index_size,
98 info->index_size);
99
100 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
101 out = T.gpu;
102 }
103
104 if (needs_indices) {
105 /* Fallback */
106 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
107
108 if (!info->has_user_indices)
109 panfrost_minmax_cache_add(rsrc->index_cache,
110 info->start, info->count,
111 *min_index, *max_index);
112 }
113
114 return out;
115 }
116
117 static unsigned
118 translate_tex_wrap(enum pipe_tex_wrap w)
119 {
120 switch (w) {
121 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
122 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
123 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
124 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
125 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
126 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
127 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
128 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
129 default: unreachable("Invalid wrap");
130 }
131 }
132
133 /* The hardware compares in the wrong order, so we have to flip before
134 * encoding. Yes, really. */
135
136 static enum mali_func
137 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
138 {
139 if (!cso->compare_mode)
140 return MALI_FUNC_NEVER;
141
142 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
143 return panfrost_flip_compare_func(f);
144 }
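/* As a worked example of the flip above (assuming panfrost_flip_compare_func
 * swaps the less/greater senses): a PIPE_FUNC_LESS compare from Gallium is
 * expected to end up encoded as MALI_FUNC_GREATER, and PIPE_FUNC_LEQUAL as
 * MALI_FUNC_GEQUAL, while the symmetric functions (EQUAL, NOTEQUAL, ALWAYS)
 * pass through unchanged. */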
145
146 static enum mali_mipmap_mode
147 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
148 {
149 switch (f) {
150 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
151 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
152 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
153 default: unreachable("Invalid");
154 }
155 }
156
157 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
158 struct mali_midgard_sampler_packed *hw)
159 {
160 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
161 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
162 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
163 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
164 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
165 cfg.normalized_coordinates = cso->normalized_coords;
166
167 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
168
169 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
170
171 /* If necessary, we disable mipmapping in the sampler descriptor by
172 * clamping the LOD as tight as possible (from 0 to epsilon,
173 * essentially -- remember these are fixed point numbers, so
174 * epsilon=1/256) */
175
176 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
177 cfg.minimum_lod + 1 :
178 FIXED_16(cso->max_lod, false);
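/* Worked example, assuming the 8 fractional bits implied by the epsilon
 * above: with min_lod = 0.0, minimum_lod packs to 0 and maximum_lod to
 * 0 + 1, i.e. an effective LOD range of [0, 1/256], which disables
 * mipmapping in practice. */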
179
180 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
181 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
182 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
183
184 cfg.compare_function = panfrost_sampler_compare_func(cso);
185 cfg.seamless_cube_map = cso->seamless_cube_map;
186
187 cfg.border_color_r = cso->border_color.f[0];
188 cfg.border_color_g = cso->border_color.f[1];
189 cfg.border_color_b = cso->border_color.f[2];
190 cfg.border_color_a = cso->border_color.f[3];
191 }
192 }
193
194 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
195 struct mali_bifrost_sampler_packed *hw)
196 {
197 pan_pack(hw, BIFROST_SAMPLER, cfg) {
198 cfg.point_sample_magnify = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
199 cfg.point_sample_minify = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
200 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
201 cfg.normalized_coordinates = cso->normalized_coords;
202
203 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
204 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
205 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
206
207 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
208 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
209 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
210
211 cfg.compare_function = panfrost_sampler_compare_func(cso);
212 cfg.seamless_cube_map = cso->seamless_cube_map;
213 }
214 }
215
216 static bool
217 panfrost_fs_required(
218 struct panfrost_shader_state *fs,
219 struct panfrost_blend_final *blend,
220 unsigned rt_count)
221 {
222 /* If we generally have side effects */
223 if (fs->fs_sidefx)
224 return true;
225
226 /* If colour is written we need to execute */
227 for (unsigned i = 0; i < rt_count; ++i) {
228 if (!blend[i].no_colour)
229 return true;
230 }
231
232 /* If depth is written and not implied we need to execute.
233 * TODO: Predicate on Z/S writes being enabled */
234 return (fs->writes_depth || fs->writes_stencil);
235 }
236
237 static void
238 panfrost_emit_bifrost_blend(struct panfrost_batch *batch,
239 struct panfrost_blend_final *blend,
240 void *rts)
241 {
242 unsigned rt_count = batch->key.nr_cbufs;
243
244 if (rt_count == 0) {
245 /* Disable blending for depth-only */
246 pan_pack(rts, BLEND, cfg) {
247 cfg.enable = false;
248 cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OFF;
249 }
250 return;
251 }
252
253 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
254 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
255
256 for (unsigned i = 0; i < rt_count; ++i) {
257 pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {
258 if (blend[i].no_colour) {
259 cfg.enable = false;
260 } else {
261 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
262 cfg.load_destination = blend[i].load_dest;
263 cfg.round_to_fb_precision = !batch->ctx->blend->base.dither;
264 }
265
266 if (blend[i].is_shader) {
267 /* The blend shader's address needs to have
268 * the same top 32 bits as the fragment shader's.
269 * TODO: Ensure that's always the case.
270 */
271 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
272 (fs->bo->ptr.gpu & (0xffffffffull << 32)));
273 cfg.bifrost.internal.shader.pc = (u32)blend[i].shader.gpu;
274 assert(!(fs->blend_ret_addrs[i] & 0x7));
275 cfg.bifrost.internal.shader.return_value = fs->blend_ret_addrs[i];
276 cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_SHADER;
277 } else {
278 enum pipe_format format = batch->key.cbufs[i]->format;
279 const struct util_format_description *format_desc;
280 unsigned chan_size = 0;
281
282 format_desc = util_format_description(format);
283
284 for (unsigned chan = 0; chan < format_desc->nr_channels; chan++)
285 chan_size = MAX2(format_desc->channel[chan].size, chan_size);
286
287 cfg.bifrost.equation = blend[i].equation.equation;
288
289 /* Fixed point constant */
290 u16 constant = blend[i].equation.constant * ((1 << chan_size) - 1);
291 constant <<= 16 - chan_size;
292 cfg.bifrost.constant = constant;
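/* For example, on an RGBA8 render target (chan_size = 8) a blend constant
 * of 1.0 becomes 1.0 * 255 = 255, then shifted left by 16 - 8 = 8 bits,
 * giving the fixed-point value 0xFF00. */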
293
294 if (blend[i].opaque)
295 cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OPAQUE;
296 else
297 cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_FIXED_FUNCTION;
298
299 /* If we want the conversion to work properly,
300 * num_comps must be set to 4
301 */
302 cfg.bifrost.internal.fixed_function.num_comps = 4;
303 cfg.bifrost.internal.fixed_function.conversion.memory_format.format =
304 panfrost_format_to_bifrost_blend(format_desc, true);
305 if (dev->quirks & HAS_SWIZZLES) {
306 cfg.bifrost.internal.fixed_function.conversion.memory_format.swizzle =
307 panfrost_get_default_swizzle(4);
308 }
309 cfg.bifrost.internal.fixed_function.conversion.register_format =
310 fs->blend_types[i];
311 }
312 }
313 }
314 }
315
316 static void
317 panfrost_emit_midgard_blend(struct panfrost_batch *batch,
318 struct panfrost_blend_final *blend,
319 void *rts)
320 {
321 unsigned rt_count = batch->key.nr_cbufs;
322
323 if (rt_count == 0) {
324 /* Disable blending for depth-only */
325 pan_pack(rts, BLEND, cfg) {
326 cfg.midgard.equation.color_mask = 0xf;
327 cfg.midgard.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
328 cfg.midgard.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
329 cfg.midgard.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
330 cfg.midgard.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
331 cfg.midgard.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
332 cfg.midgard.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
333 }
334 return;
335 }
336
337 for (unsigned i = 0; i < rt_count; ++i) {
338 pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {
339 if (blend[i].no_colour) {
340 cfg.enable = false;
341 continue;
342 }
343
344 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
345 cfg.load_destination = blend[i].load_dest;
346 cfg.round_to_fb_precision = !batch->ctx->blend->base.dither;
347 cfg.midgard.blend_shader = blend[i].is_shader;
348 if (blend[i].is_shader) {
349 cfg.midgard.shader_pc = blend[i].shader.gpu | blend[i].shader.first_tag;
350 } else {
351 cfg.midgard.equation = blend[i].equation.equation;
352 cfg.midgard.constant = blend[i].equation.constant;
353 }
354 }
355 }
356 }
357
358 static void
359 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
360 struct panfrost_blend_final *blend)
361 {
362 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
363
364 if (dev->quirks & IS_BIFROST)
365 panfrost_emit_bifrost_blend(batch, blend, rts);
366 else
367 panfrost_emit_midgard_blend(batch, blend, rts);
368
369 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
370 if (!blend[i].no_colour)
371 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
372 }
373 }
374
375 static void
376 panfrost_prepare_bifrost_fs_state(struct panfrost_context *ctx,
377 struct panfrost_blend_final *blend,
378 struct MALI_RENDERER_STATE *state)
379 {
380 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
381 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
382
383 if (!panfrost_fs_required(fs, blend, rt_count)) {
384 state->properties.uniform_buffer_count = 32;
385 state->properties.bifrost.shader_modifies_coverage = true;
386 state->properties.bifrost.allow_forward_pixel_to_kill = true;
387 state->properties.bifrost.allow_forward_pixel_to_be_killed = true;
388 state->properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
389 } else {
390 bool no_blend = true;
391
392 for (unsigned i = 0; i < rt_count; ++i)
393 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
394
395 state->properties = fs->properties;
396 state->properties.bifrost.allow_forward_pixel_to_kill =
397 !fs->can_discard && !fs->writes_depth && no_blend;
398 state->shader = fs->shader;
399 state->preload = fs->preload;
400 }
401 }
402
403 static void
404 panfrost_prepare_midgard_fs_state(struct panfrost_context *ctx,
405 struct panfrost_blend_final *blend,
406 struct MALI_RENDERER_STATE *state)
407 {
408 const struct panfrost_device *dev = pan_device(ctx->base.screen);
409 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
410 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
411 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
412 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
413
414 if (!panfrost_fs_required(fs, blend, rt_count)) {
415 state->shader.shader = 0x1;
416 state->properties.midgard.work_register_count = 1;
417 state->properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
418 state->properties.midgard.force_early_z = true;
419 } else {
420 /* Reasons to disable early-Z from a shader perspective */
421 bool late_z = fs->can_discard || fs->writes_global ||
422 fs->writes_depth || fs->writes_stencil;
423
424 /* If either depth or stencil is enabled, discard matters */
425 bool zs_enabled =
426 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
427 zsa->base.stencil[0].enabled;
428
429 bool has_blend_shader = false;
430
431 for (unsigned c = 0; c < rt_count; ++c)
432 has_blend_shader |= blend[c].is_shader;
433
434 /* TODO: Reduce this limit? */
435 state->properties = fs->properties;
436 if (has_blend_shader)
437 state->properties.midgard.work_register_count = MAX2(fs->work_reg_count, 8);
438 else
439 state->properties.midgard.work_register_count = fs->work_reg_count;
440
441 state->properties.midgard.force_early_z = !(late_z || alpha_to_coverage);
442
443 /* Work around a hardware erratum where early-z cannot be enabled
444 * when discarding, even when the depth buffer is read-only, by
445 * lying to the hardware about the discard and setting the
446 * "shader reads tilebuffer" flag to compensate */
447 state->properties.midgard.shader_reads_tilebuffer =
448 fs->outputs_read || (!zs_enabled && fs->can_discard);
449 state->properties.midgard.shader_contains_discard = zs_enabled && fs->can_discard;
450 state->shader = fs->shader;
451 }
452
453 if (dev->quirks & MIDGARD_SFBD) {
454 state->multisample_misc.sfbd_load_destination = blend[0].load_dest;
455 state->multisample_misc.sfbd_blend_shader = blend[0].is_shader;
456 state->stencil_mask_misc.sfbd_write_enable = !blend[0].no_colour;
457 state->stencil_mask_misc.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
458 state->stencil_mask_misc.sfbd_dither_disable = !ctx->blend->base.dither;
459
460 if (blend[0].is_shader) {
461 state->sfbd_blend_shader = blend[0].shader.gpu |
462 blend[0].shader.first_tag;
463 } else {
464 state->sfbd_blend_equation = blend[0].equation.equation;
465 state->sfbd_blend_constant = blend[0].equation.constant;
466 }
467 } else {
468 /* Bug where MRT-capable hw apparently reads the last blend
469 * shader from here instead of the usual location? */
470
471 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
472 if (!blend[rt].is_shader)
473 continue;
474
475 state->sfbd_blend_shader = blend[rt].shader.gpu |
476 blend[rt].shader.first_tag;
477 break;
478 }
479 }
480 }
481
482 static void
483 panfrost_prepare_fs_state(struct panfrost_context *ctx,
484 struct panfrost_blend_final *blend,
485 struct MALI_RENDERER_STATE *state)
486 {
487 const struct panfrost_device *dev = pan_device(ctx->base.screen);
488 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
489 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
490 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
491 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
492
493 if (dev->quirks & IS_BIFROST)
494 panfrost_prepare_bifrost_fs_state(ctx, blend, state);
495 else
496 panfrost_prepare_midgard_fs_state(ctx, blend, state);
497
498 bool msaa = rast->multisample;
499 state->multisample_misc.multisample_enable = msaa;
500 state->multisample_misc.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
501
502 /* EXT_shader_framebuffer_fetch requires per-sample */
503 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
504 state->multisample_misc.evaluate_per_sample = msaa && per_sample;
505 state->multisample_misc.depth_function = zsa->base.depth.enabled ?
506 panfrost_translate_compare_func(zsa->base.depth.func) :
507 MALI_FUNC_ALWAYS;
508
509 state->multisample_misc.depth_write_mask = zsa->base.depth.writemask;
510 state->multisample_misc.fixed_function_near_discard = rast->depth_clip_near;
511 state->multisample_misc.fixed_function_far_discard = rast->depth_clip_far;
512 state->multisample_misc.shader_depth_range_fixed = true;
513
514 state->stencil_mask_misc.stencil_mask_front = zsa->stencil_mask_front;
515 state->stencil_mask_misc.stencil_mask_back = zsa->stencil_mask_back;
516 state->stencil_mask_misc.stencil_enable = zsa->base.stencil[0].enabled;
517 state->stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
518 state->stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;
519 state->stencil_mask_misc.depth_range_1 = rast->offset_tri;
520 state->stencil_mask_misc.depth_range_2 = rast->offset_tri;
521 state->stencil_mask_misc.single_sampled_lines = !rast->multisample;
522 state->depth_units = rast->offset_units * 2.0f;
523 state->depth_factor = rast->offset_scale;
524
525 bool back_enab = zsa->base.stencil[1].enabled;
526 state->stencil_front = zsa->stencil_front;
527 state->stencil_back = zsa->stencil_back;
528 state->stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
529 state->stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
530 }
531
532
533 static void
534 panfrost_emit_frag_shader(struct panfrost_context *ctx,
535 struct mali_renderer_state_packed *fragmeta,
536 struct panfrost_blend_final *blend)
537 {
538 pan_pack(fragmeta, RENDERER_STATE, cfg) {
539 panfrost_prepare_fs_state(ctx, blend, &cfg);
540 }
541 }
542
543 mali_ptr
544 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
545 {
546 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
547
548 panfrost_batch_add_bo(batch, ss->bo,
549 PAN_BO_ACCESS_PRIVATE |
550 PAN_BO_ACCESS_READ |
551 PAN_BO_ACCESS_VERTEX_TILER);
552
553 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
554 PAN_BO_ACCESS_PRIVATE |
555 PAN_BO_ACCESS_READ |
556 PAN_BO_ACCESS_VERTEX_TILER);
557
558 return pan_resource(ss->upload.rsrc)->bo->ptr.gpu + ss->upload.offset;
559 }
560
561 mali_ptr
562 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
563 {
564 struct panfrost_context *ctx = batch->ctx;
565 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
566
567 /* Add the shader BO to the batch. */
568 panfrost_batch_add_bo(batch, ss->bo,
569 PAN_BO_ACCESS_PRIVATE |
570 PAN_BO_ACCESS_READ |
571 PAN_BO_ACCESS_FRAGMENT);
572
573 struct panfrost_device *dev = pan_device(ctx->base.screen);
574 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
575 struct panfrost_ptr xfer;
576 unsigned rt_size;
577
578 if (dev->quirks & MIDGARD_SFBD)
579 rt_size = 0;
580 else
581 rt_size = MALI_BLEND_LENGTH;
582
583 unsigned desc_size = MALI_RENDERER_STATE_LENGTH + rt_size * rt_count;
584 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_RENDERER_STATE_LENGTH);
585
586 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
587 unsigned shader_offset = 0;
588 struct panfrost_bo *shader_bo = NULL;
589
590 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
591 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
592 &shader_offset);
593 panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend);
594
595 if (!(dev->quirks & MIDGARD_SFBD))
596 panfrost_emit_blend(batch, xfer.cpu + MALI_RENDERER_STATE_LENGTH, blend);
597 else
598 batch->draws |= PIPE_CLEAR_COLOR0;
599
600 return xfer.gpu;
601 }
602
603 mali_ptr
604 panfrost_emit_viewport(struct panfrost_batch *batch)
605 {
606 struct panfrost_context *ctx = batch->ctx;
607 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
608 const struct pipe_scissor_state *ss = &ctx->scissor;
609 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
610 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
611
612 /* Derive min/max from translate/scale. Note since |x| >= 0 by
613 * definition, we have that -|x| <= |x| hence translate - |scale| <=
614 * translate + |scale|, so the ordering is correct here. */
615 float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
616 float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
617 float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
618 float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
619 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
620 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
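/* For example, a full-screen 1920x1080 viewport has translate = (960, 540)
 * and scale = (960, +/-540) depending on Y-flip; the fabsf() above makes
 * both cases yield (vp_minx, vp_miny) = (0, 0) and
 * (vp_maxx, vp_maxy) = (1920, 1080). */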
621
622 /* Scissor to the intersection of the viewport and the scissor, clamped
623 * to the framebuffer */
624
625 unsigned minx = MIN2(fb->width, MAX2((int) vp_minx, 0));
626 unsigned maxx = MIN2(fb->width, MAX2((int) vp_maxx, 0));
627 unsigned miny = MIN2(fb->height, MAX2((int) vp_miny, 0));
628 unsigned maxy = MIN2(fb->height, MAX2((int) vp_maxy, 0));
629
630 if (ss && rast->scissor) {
631 minx = MAX2(ss->minx, minx);
632 miny = MAX2(ss->miny, miny);
633 maxx = MIN2(ss->maxx, maxx);
634 maxy = MIN2(ss->maxy, maxy);
635 }
636
637 /* Set the range to [1, 1) so max values don't wrap round */
638 if (maxx == 0 || maxy == 0)
639 maxx = maxy = minx = miny = 1;
640
641 struct panfrost_ptr T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
642
643 pan_pack(T.cpu, VIEWPORT, cfg) {
644 /* [minx, maxx) and [miny, maxy) are exclusive ranges, but
645 * these are inclusive */
646 cfg.scissor_minimum_x = minx;
647 cfg.scissor_minimum_y = miny;
648 cfg.scissor_maximum_x = maxx - 1;
649 cfg.scissor_maximum_y = maxy - 1;
650
651 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
652 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
653 }
654
655 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
656 return T.gpu;
657 }
658
659 static mali_ptr
660 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
661 enum pipe_shader_type st,
662 struct panfrost_constant_buffer *buf,
663 unsigned index)
664 {
665 struct pipe_constant_buffer *cb = &buf->cb[index];
666 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
667
668 if (rsrc) {
669 panfrost_batch_add_bo(batch, rsrc->bo,
670 PAN_BO_ACCESS_SHARED |
671 PAN_BO_ACCESS_READ |
672 panfrost_bo_access_for_stage(st));
673
674 /* Alignment guaranteed by
675 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
676 return rsrc->bo->ptr.gpu + cb->buffer_offset;
677 } else if (cb->user_buffer) {
678 return panfrost_pool_upload_aligned(&batch->pool,
679 cb->user_buffer +
680 cb->buffer_offset,
681 cb->buffer_size, 16);
682 } else {
683 unreachable("No constant buffer");
684 }
685 }
686
687 struct sysval_uniform {
688 union {
689 float f[4];
690 int32_t i[4];
691 uint32_t u[4];
692 uint64_t du[2];
693 };
694 };
695
696 static void
697 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
698 struct sysval_uniform *uniform)
699 {
700 struct panfrost_context *ctx = batch->ctx;
701 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
702
703 uniform->f[0] = vp->scale[0];
704 uniform->f[1] = vp->scale[1];
705 uniform->f[2] = vp->scale[2];
706 }
707
708 static void
709 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
710 struct sysval_uniform *uniform)
711 {
712 struct panfrost_context *ctx = batch->ctx;
713 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
714
715 uniform->f[0] = vp->translate[0];
716 uniform->f[1] = vp->translate[1];
717 uniform->f[2] = vp->translate[2];
718 }
719
720 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
721 enum pipe_shader_type st,
722 unsigned int sysvalid,
723 struct sysval_uniform *uniform)
724 {
725 struct panfrost_context *ctx = batch->ctx;
726 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
727 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
728 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
729 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
730
731 assert(dim);
732 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
733
734 if (dim > 1)
735 uniform->i[1] = u_minify(tex->texture->height0,
736 tex->u.tex.first_level);
737
738 if (dim > 2)
739 uniform->i[2] = u_minify(tex->texture->depth0,
740 tex->u.tex.first_level);
741
742 if (is_array)
743 uniform->i[dim] = tex->texture->array_size;
744 }
745
746 static void
747 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
748 enum pipe_shader_type st,
749 unsigned ssbo_id,
750 struct sysval_uniform *uniform)
751 {
752 struct panfrost_context *ctx = batch->ctx;
753
754 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
755 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
756
757 /* Compute address */
758 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
759
760 panfrost_batch_add_bo(batch, bo,
761 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
762 panfrost_bo_access_for_stage(st));
763
764 /* Upload address and size as sysval */
765 uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
766 uniform->u[2] = sb.buffer_size;
767 }
768
769 static void
770 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
771 enum pipe_shader_type st,
772 unsigned samp_idx,
773 struct sysval_uniform *uniform)
774 {
775 struct panfrost_context *ctx = batch->ctx;
776 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
777
778 uniform->f[0] = sampl->min_lod;
779 uniform->f[1] = sampl->max_lod;
780 uniform->f[2] = sampl->lod_bias;
781
782 /* Even without any errata, Midgard represents "no mipmapping" as
783 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
784 * panfrost_create_sampler_state which also explains our choice of
785 * epsilon value (again to keep behaviour consistent) */
786
787 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
788 uniform->f[1] = uniform->f[0] + (1.0/256.0);
789 }
790
791 static void
792 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
793 struct sysval_uniform *uniform)
794 {
795 struct panfrost_context *ctx = batch->ctx;
796
797 uniform->u[0] = ctx->compute_grid->grid[0];
798 uniform->u[1] = ctx->compute_grid->grid[1];
799 uniform->u[2] = ctx->compute_grid->grid[2];
800 }
801
802 static void
803 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
804 struct panfrost_shader_state *ss,
805 enum pipe_shader_type st)
806 {
807 struct sysval_uniform *uniforms = (void *)buf;
808
809 for (unsigned i = 0; i < ss->sysval_count; ++i) {
810 int sysval = ss->sysval[i];
811
812 switch (PAN_SYSVAL_TYPE(sysval)) {
813 case PAN_SYSVAL_VIEWPORT_SCALE:
814 panfrost_upload_viewport_scale_sysval(batch,
815 &uniforms[i]);
816 break;
817 case PAN_SYSVAL_VIEWPORT_OFFSET:
818 panfrost_upload_viewport_offset_sysval(batch,
819 &uniforms[i]);
820 break;
821 case PAN_SYSVAL_TEXTURE_SIZE:
822 panfrost_upload_txs_sysval(batch, st,
823 PAN_SYSVAL_ID(sysval),
824 &uniforms[i]);
825 break;
826 case PAN_SYSVAL_SSBO:
827 panfrost_upload_ssbo_sysval(batch, st,
828 PAN_SYSVAL_ID(sysval),
829 &uniforms[i]);
830 break;
831 case PAN_SYSVAL_NUM_WORK_GROUPS:
832 panfrost_upload_num_work_groups_sysval(batch,
833 &uniforms[i]);
834 break;
835 case PAN_SYSVAL_SAMPLER:
836 panfrost_upload_sampler_sysval(batch, st,
837 PAN_SYSVAL_ID(sysval),
838 &uniforms[i]);
839 break;
840 default:
841 assert(0);
842 }
843 }
844 }
845
846 static const void *
847 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
848 unsigned index)
849 {
850 struct pipe_constant_buffer *cb = &buf->cb[index];
851 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
852
853 if (rsrc)
854 return rsrc->bo->ptr.cpu;
855 else if (cb->user_buffer)
856 return cb->user_buffer;
857 else
858 unreachable("No constant buffer");
859 }
860
861 mali_ptr
862 panfrost_emit_const_buf(struct panfrost_batch *batch,
863 enum pipe_shader_type stage,
864 mali_ptr *push_constants)
865 {
866 struct panfrost_context *ctx = batch->ctx;
867 struct panfrost_shader_variants *all = ctx->shader[stage];
868
869 if (!all)
870 return 0;
871
872 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
873
874 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
875
876 /* Uniforms are implicitly UBO #0 */
877 bool has_uniforms = buf->enabled_mask & (1 << 0);
878
879 /* Allocate room for the sysval and the uniforms */
880 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
881 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
882 size_t size = sys_size + uniform_size;
883 struct panfrost_ptr transfer =
884 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
885
886 /* Upload sysvals requested by the shader */
887 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
888
889 /* Upload uniforms */
890 if (has_uniforms && uniform_size) {
891 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
892 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
893 }
894
895 /* Next up, attach UBOs. UBO #0 is the uniforms we just
896 * uploaded, so it's always included. The count is the highest UBO
897 * addressable -- gaps are included. */
898
899 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
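/* For example, enabled_mask = 0b1011 (UBOs 0, 1 and 3 bound) gives
 * ubo_count = 32 - __builtin_clz(0b1011) = 4, so the disabled UBO 2 is
 * still addressable as a gap. */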
900
901 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
902 struct panfrost_ptr ubos =
903 panfrost_pool_alloc_aligned(&batch->pool, sz,
904 MALI_UNIFORM_BUFFER_LENGTH);
905
906 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
907
908 /* Upload uniforms as a UBO */
909
910 if (size) {
911 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
912 cfg.entries = DIV_ROUND_UP(size, 16);
913 cfg.pointer = transfer.gpu;
914 }
915 } else {
916 *ubo_ptr = 0;
917 }
918
919 /* The rest are honest-to-goodness UBOs */
920
921 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
922 size_t usz = buf->cb[ubo].buffer_size;
923 bool enabled = buf->enabled_mask & (1 << ubo);
924 bool empty = usz == 0;
925
926 if (!enabled || empty) {
927 ubo_ptr[ubo] = 0;
928 continue;
929 }
930
931 /* Issue (57) for the ARB_uniform_buffer_object spec says that
932 * the buffer can be larger than the uniform data inside it,
933 * so clamp ubo size to what hardware supports. */
934
935 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
936 cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);
937 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
938 stage, buf, ubo);
939 }
940 }
941
942 if (ss->uniform_count)
943 *push_constants = transfer.gpu;
944
945 buf->dirty_mask = 0;
946 return ubos.gpu;
947 }
948
949 mali_ptr
950 panfrost_emit_shared_memory(struct panfrost_batch *batch,
951 const struct pipe_grid_info *info)
952 {
953 struct panfrost_context *ctx = batch->ctx;
954 struct panfrost_device *dev = pan_device(ctx->base.screen);
955 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
956 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
957 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
958 128));
959
960 unsigned instances =
961 util_next_power_of_two(info->grid[0]) *
962 util_next_power_of_two(info->grid[1]) *
963 util_next_power_of_two(info->grid[2]);
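/* For example, a (3, 5, 2) grid rounds each dimension up to a power of two,
 * giving 4 * 8 * 2 = 64 instances of workgroup-local storage per core. */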
964
965 unsigned shared_size = single_size * instances * dev->core_count;
966 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
967 shared_size,
968 1);
969 struct panfrost_ptr t =
970 panfrost_pool_alloc_aligned(&batch->pool,
971 MALI_LOCAL_STORAGE_LENGTH,
972 64);
973
974 pan_pack(t.cpu, LOCAL_STORAGE, ls) {
975 ls.wls_base_pointer = bo->ptr.gpu;
976 ls.wls_instances = instances;
977 ls.wls_size_scale = util_logbase2(single_size) + 1;
978 };
979
980 return t.gpu;
981 }
982
983 static mali_ptr
984 panfrost_get_tex_desc(struct panfrost_batch *batch,
985 enum pipe_shader_type st,
986 struct panfrost_sampler_view *view)
987 {
988 if (!view)
989 return (mali_ptr) 0;
990
991 struct pipe_sampler_view *pview = &view->base;
992 struct panfrost_resource *rsrc = pan_resource(pview->texture);
993
994 /* Add the BO to the job so it's retained until the job is done. */
995
996 panfrost_batch_add_bo(batch, rsrc->bo,
997 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
998 panfrost_bo_access_for_stage(st));
999
1000 panfrost_batch_add_bo(batch, view->bo,
1001 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1002 panfrost_bo_access_for_stage(st));
1003
1004 return view->bo->ptr.gpu;
1005 }
1006
1007 static void
1008 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1009 struct pipe_context *pctx)
1010 {
1011 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1012 if (view->texture_bo != rsrc->bo->ptr.gpu ||
1013 view->modifier != rsrc->modifier) {
1014 panfrost_bo_unreference(view->bo);
1015 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1016 }
1017 }
1018
1019 mali_ptr
1020 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1021 enum pipe_shader_type stage)
1022 {
1023 struct panfrost_context *ctx = batch->ctx;
1024 struct panfrost_device *device = pan_device(ctx->base.screen);
1025
1026 if (!ctx->sampler_view_count[stage])
1027 return 0;
1028
1029 if (device->quirks & IS_BIFROST) {
1030 struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
1031 MALI_BIFROST_TEXTURE_LENGTH *
1032 ctx->sampler_view_count[stage],
1033 MALI_BIFROST_TEXTURE_LENGTH);
1034
1035 struct mali_bifrost_texture_packed *out =
1036 (struct mali_bifrost_texture_packed *) T.cpu;
1037
1038 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1039 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1040 struct pipe_sampler_view *pview = &view->base;
1041 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1042
1043 panfrost_update_sampler_view(view, &ctx->base);
1044 out[i] = view->bifrost_descriptor;
1045
1046 /* Add the BOs to the job so they are retained until the job is done. */
1047
1048 panfrost_batch_add_bo(batch, rsrc->bo,
1049 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1050 panfrost_bo_access_for_stage(stage));
1051
1052 panfrost_batch_add_bo(batch, view->bo,
1053 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1054 panfrost_bo_access_for_stage(stage));
1055 }
1056
1057 return T.gpu;
1058 } else {
1059 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1060
1061 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1062 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1063
1064 panfrost_update_sampler_view(view, &ctx->base);
1065
1066 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1067 }
1068
1069 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1070 sizeof(uint64_t) *
1071 ctx->sampler_view_count[stage],
1072 sizeof(uint64_t));
1073 }
1074 }
1075
1076 mali_ptr
1077 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1078 enum pipe_shader_type stage)
1079 {
1080 struct panfrost_context *ctx = batch->ctx;
1081
1082 if (!ctx->sampler_count[stage])
1083 return 0;
1084
1085 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1086 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1087
1088 size_t sz = desc_size * ctx->sampler_count[stage];
1089 struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1090 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1091
1092 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1093 out[i] = ctx->samplers[stage][i]->hw;
1094
1095 return T.gpu;
1096 }
1097
1098 mali_ptr
1099 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1100 mali_ptr *buffers)
1101 {
1102 struct panfrost_context *ctx = batch->ctx;
1103 struct panfrost_device *dev = pan_device(ctx->base.screen);
1104 bool is_bifrost = !!(dev->quirks & IS_BIFROST);
1105 struct panfrost_vertex_state *so = ctx->vertex;
1106 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1107
1108 /* Worst case: everything is NPOT, which is only possible if instancing
1109 * is enabled. Otherwise a single record is guaranteed */
1110 struct panfrost_ptr S = panfrost_pool_alloc_aligned(&batch->pool,
1111 MALI_ATTRIBUTE_BUFFER_LENGTH * (vs->attribute_count + 1) *
1112 (ctx->instance_count > 1 ? 2 : 1),
1113 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1114
1115 struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
1116 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1117 MALI_ATTRIBUTE_LENGTH);
1118
1119 struct mali_attribute_buffer_packed *bufs =
1120 (struct mali_attribute_buffer_packed *) S.cpu;
1121
1122 /* Determine (n + 1)'th index to suppress prefetch on Bifrost */
1123 unsigned last = vs->attribute_count * ((ctx->instance_count > 1) ? 2 : 1);
1124 memset(bufs + last, 0, sizeof(*bufs));
1125
1126 struct mali_attribute_packed *out =
1127 (struct mali_attribute_packed *) T.cpu;
1128
1129 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1130 unsigned k = 0;
1131
1132 for (unsigned i = 0; i < so->num_elements; ++i) {
1133 /* We map buffers 1:1 with the attributes, which
1134 * means duplicating some vertex buffers (who cares? aside from
1135 * maybe some caching implications but I somehow doubt that
1136 * matters) */
1137
1138 struct pipe_vertex_element *elem = &so->pipe[i];
1139 unsigned vbi = elem->vertex_buffer_index;
1140 attrib_to_buffer[i] = k;
1141
1142 if (!(ctx->vb_mask & (1 << vbi)))
1143 continue;
1144
1145 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1146 struct panfrost_resource *rsrc;
1147
1148 rsrc = pan_resource(buf->buffer.resource);
1149 if (!rsrc)
1150 continue;
1151
1152 /* Add a dependency of the batch on the vertex buffer */
1153 panfrost_batch_add_bo(batch, rsrc->bo,
1154 PAN_BO_ACCESS_SHARED |
1155 PAN_BO_ACCESS_READ |
1156 PAN_BO_ACCESS_VERTEX_TILER);
1157
1158 /* Mask off lower bits, see offset fixup below */
1159 mali_ptr raw_addr = rsrc->bo->ptr.gpu + buf->buffer_offset;
1160 mali_ptr addr = raw_addr & ~63;
1161
1162 /* Since we advanced the base pointer, we shrink the buffer
1163 * size, but add the offset we subtracted */
1164 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1165 - buf->buffer_offset;
1166
1167 /* When there is a divisor, the hardware-level divisor is
1168 * the product of the instance divisor and the padded count */
1169 unsigned divisor = elem->instance_divisor;
1170 unsigned hw_divisor = ctx->padded_count * divisor;
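/* e.g. an instance divisor of 3 with padded_count = 4 gives a hardware
 * divisor of 12, since each instance is padded out to 4 vertex invocations
 * in the linear invocation space the hardware divides by. */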
1171 unsigned stride = buf->stride;
1172
1173 /* If there's a divisor(=1) but no instancing, we want every
1174 * attribute to be the same */
1175
1176 if (divisor && ctx->instance_count == 1)
1177 stride = 0;
1178
1179 if (!divisor || ctx->instance_count <= 1) {
1180 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1181 if (ctx->instance_count > 1) {
1182 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1183 cfg.divisor = ctx->padded_count;
1184 }
1185
1186 cfg.pointer = addr;
1187 cfg.stride = stride;
1188 cfg.size = size;
1189 }
1190 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1191 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1192 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1193 cfg.pointer = addr;
1194 cfg.stride = stride;
1195 cfg.size = size;
1196 cfg.divisor_r = __builtin_ctz(hw_divisor);
1197 }
1198
1199 } else {
1200 unsigned shift = 0, extra_flags = 0;
1201
1202 unsigned magic_divisor =
1203 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1204
1205 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1206 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1207 cfg.pointer = addr;
1208 cfg.stride = stride;
1209 cfg.size = size;
1210
1211 cfg.divisor_r = shift;
1212 cfg.divisor_e = extra_flags;
1213 }
1214
1215 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1216 cfg.divisor_numerator = magic_divisor;
1217 cfg.divisor = divisor;
1218 }
1219
1220 ++k;
1221 }
1222
1223 ++k;
1224 }
1225
1226 /* Add special gl_VertexID/gl_InstanceID buffers */
1227
1228 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1229 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1230
1231 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1232 cfg.buffer_index = k++;
1233 cfg.format = so->formats[PAN_VERTEX_ID];
1234 }
1235
1236 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1237
1238 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1239 cfg.buffer_index = k++;
1240 cfg.format = so->formats[PAN_INSTANCE_ID];
1241 }
1242 }
1243
1244 /* We need an empty attrib buf to stop the prefetching on Bifrost */
1245 if (is_bifrost)
1246 pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
1247
1248 /* Attribute addresses require 64-byte alignment, so let:
1249 *
1250 * base' = base & ~63 = base - (base & 63)
1251 * offset' = offset + (base & 63)
1252 *
1253 * Since base' + offset' = base + offset, these are equivalent
1254 * addressing modes and now base is 64 aligned.
1255 */
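/* For example, base = 0x1007 and offset = 16 become base' = 0x1000 and
 * offset' = 16 + 7 = 23; the sum is unchanged but base' is now 64-byte
 * aligned as the hardware requires. */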
1256
1257 for (unsigned i = 0; i < so->num_elements; ++i) {
1258 unsigned vbi = so->pipe[i].vertex_buffer_index;
1259 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1260
1261 /* Adjust by the masked off bits of the offset. Make sure we
1262 * read src_offset from so->hw (which is not GPU visible)
1263 * rather than target (which is) due to caching effects */
1264
1265 unsigned src_offset = so->pipe[i].src_offset;
1266
1267 /* BOs aligned to 4k so guaranteed aligned to 64 */
1268 src_offset += (buf->buffer_offset & 63);
1269
1270 /* Also, somewhat obscurely per-instance data needs to be
1271 * offset in response to a delayed start in an indexed draw */
1272
1273 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1274 src_offset -= buf->stride * ctx->offset_start;
1275
1276 pan_pack(out + i, ATTRIBUTE, cfg) {
1277 cfg.buffer_index = attrib_to_buffer[i];
1278 cfg.format = so->formats[i];
1279 cfg.offset = src_offset;
1280 }
1281 }
1282
1283 *buffers = S.gpu;
1284 return T.gpu;
1285 }
1286
1287 static mali_ptr
1288 panfrost_emit_varyings(struct panfrost_batch *batch,
1289 struct mali_attribute_buffer_packed *slot,
1290 unsigned stride, unsigned count)
1291 {
1292 unsigned size = stride * count;
1293 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1294
1295 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1296 cfg.stride = stride;
1297 cfg.size = size;
1298 cfg.pointer = ptr;
1299 }
1300
1301 return ptr;
1302 }
1303
1304 static unsigned
1305 panfrost_streamout_offset(unsigned stride,
1306 struct pipe_stream_output_target *target)
1307 {
1308 return (target->buffer_offset + (pan_so_target(target)->offset * stride * 4)) & 63;
1309 }
1310
1311 static void
1312 panfrost_emit_streamout(struct panfrost_batch *batch,
1313 struct mali_attribute_buffer_packed *slot,
1314 unsigned stride_words, unsigned count,
1315 struct pipe_stream_output_target *target)
1316 {
1317 unsigned stride = stride_words * 4;
1318 unsigned max_size = target->buffer_size;
1319 unsigned expected_size = stride * count;
1320
1321 /* Grab the BO and bind it to the batch */
1322 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1323
1324 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1325 * the perspective of the TILER and FRAGMENT.
1326 */
1327 panfrost_batch_add_bo(batch, bo,
1328 PAN_BO_ACCESS_SHARED |
1329 PAN_BO_ACCESS_RW |
1330 PAN_BO_ACCESS_VERTEX_TILER |
1331 PAN_BO_ACCESS_FRAGMENT);
1332
1333 /* We will have an offset applied to get alignment */
1334 mali_ptr addr = bo->ptr.gpu + target->buffer_offset + (pan_so_target(target)->offset * stride);
1335
1336 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1337 cfg.pointer = (addr & ~63);
1338 cfg.stride = stride;
1339 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1340 }
1341 }
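/* Sketch of the alignment split above, with an assumed buffer_offset of 72
 * and offset = 0: addr is base + 72, so the record points at base + 64 and
 * the remaining 8 bytes are folded into the size here and returned as the
 * varying offset by panfrost_streamout_offset(). */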
1342
1343 /* Helpers for manipulating stream out information so we can pack varyings
1344 * accordingly. Compute the src_offset for a given captured varying */
1345
1346 static struct pipe_stream_output *
1347 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1348 {
1349 for (unsigned i = 0; i < info->num_outputs; ++i) {
1350 if (info->output[i].register_index == loc)
1351 return &info->output[i];
1352 }
1353
1354 unreachable("Varying not captured");
1355 }
1356
1357 static unsigned
1358 pan_varying_size(enum mali_format fmt)
1359 {
1360 unsigned type = MALI_EXTRACT_TYPE(fmt);
1361 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1362 unsigned bits = MALI_EXTRACT_BITS(fmt);
1363 unsigned bpc = 0;
1364
1365 if (bits == MALI_CHANNEL_FLOAT) {
1366 /* No doubles */
1367 bool fp16 = (type == MALI_FORMAT_SINT);
1368 assert(fp16 || (type == MALI_FORMAT_UNORM));
1369
1370 bpc = fp16 ? 2 : 4;
1371 } else {
1372 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1373
1374 /* See the enums */
1375 bits = 1 << bits;
1376 assert(bits >= 8);
1377 bpc = bits / 8;
1378 }
1379
1380 return bpc * chan;
1381 }
1382
1383 /* Indices for named (non-XFB) varyings that are present. These are packed
1384 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1385 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1386 * of a given special field given a shift S by:
1387 *
1388 * idx = popcount(P & ((1 << S) - 1))
1389 *
1390 * That is, look at all of the varyings that come earlier and count them;
1391 * that count is the index. Likewise, the total number of special
1392 * buffers required is simply popcount(P)
1393 */
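/* Worked example: if GENERAL, POSITION and PSIZ are present (P = 0b111),
 * then for S = PAN_VARY_PSIZ the index is popcount(0b111 & 0b011) = 2,
 * and popcount(P) = 3 special buffers are needed in total. */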
1394
1395 enum pan_special_varying {
1396 PAN_VARY_GENERAL = 0,
1397 PAN_VARY_POSITION = 1,
1398 PAN_VARY_PSIZ = 2,
1399 PAN_VARY_PNTCOORD = 3,
1400 PAN_VARY_FACE = 4,
1401 PAN_VARY_FRAGCOORD = 5,
1402
1403 /* Keep last */
1404 PAN_VARY_MAX,
1405 };
1406
1407 /* Given a varying, figure out which index it corresponds to */
1408
1409 static inline unsigned
1410 pan_varying_index(unsigned present, enum pan_special_varying v)
1411 {
1412 unsigned mask = (1 << v) - 1;
1413 return util_bitcount(present & mask);
1414 }
1415
1416 /* Get the base offset for XFB buffers, which by convention come after
1417 * everything else. Wrapper function for semantic reasons; by construction this
1418 * is just popcount. */
1419
1420 static inline unsigned
1421 pan_xfb_base(unsigned present)
1422 {
1423 return util_bitcount(present);
1424 }
1425
1426 /* Computes the present mask for varyings so we can start emitting varying records */
1427
1428 static inline unsigned
1429 pan_varying_present(
1430 struct panfrost_shader_state *vs,
1431 struct panfrost_shader_state *fs,
1432 unsigned quirks,
1433 uint16_t point_coord_mask)
1434 {
1435 /* At the moment we always emit general and position buffers. Not
1436 * strictly necessary but usually harmless */
1437
1438 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1439
1440 /* Enable special buffers by the shader info */
1441
1442 if (vs->writes_point_size)
1443 present |= (1 << PAN_VARY_PSIZ);
1444
1445 if (fs->reads_point_coord)
1446 present |= (1 << PAN_VARY_PNTCOORD);
1447
1448 if (fs->reads_face)
1449 present |= (1 << PAN_VARY_FACE);
1450
1451 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1452 present |= (1 << PAN_VARY_FRAGCOORD);
1453
1454 /* Also, if we have a point sprite, we need a point coord buffer */
1455
1456 for (unsigned i = 0; i < fs->varying_count; i++) {
1457 gl_varying_slot loc = fs->varyings_loc[i];
1458
1459 if (util_varying_is_point_coord(loc, point_coord_mask))
1460 present |= (1 << PAN_VARY_PNTCOORD);
1461 }
1462
1463 return present;
1464 }
1465
1466 /* Emitters for varying records */
1467
1468 static void
1469 pan_emit_vary(struct mali_attribute_packed *out,
1470 unsigned present, enum pan_special_varying buf,
1471 unsigned quirks, enum mali_format format,
1472 unsigned offset)
1473 {
1474 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1475 unsigned swizzle = quirks & HAS_SWIZZLES ?
1476 panfrost_get_default_swizzle(nr_channels) :
1477 panfrost_bifrost_swizzle(nr_channels);
1478
1479 pan_pack(out, ATTRIBUTE, cfg) {
1480 cfg.buffer_index = pan_varying_index(present, buf);
1481 cfg.offset_enable = quirks & IS_BIFROST ? false : true;
1482 cfg.format = (format << 12) | swizzle;
1483 cfg.offset = offset;
1484 }
1485 }
1486
1487 /* General varying that is unused */
1488
1489 static void
1490 pan_emit_vary_only(struct mali_attribute_packed *out,
1491 unsigned present, unsigned quirks)
1492 {
1493 pan_emit_vary(out, present, 0, quirks, MALI_CONSTANT, 0);
1494 }
1495
1496 /* Special records */
1497
1498 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1499 [PAN_VARY_POSITION] = MALI_SNAP_4,
1500 [PAN_VARY_PSIZ] = MALI_R16F,
1501 [PAN_VARY_PNTCOORD] = MALI_R16F,
1502 [PAN_VARY_FACE] = MALI_R32I,
1503 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1504 };
1505
1506 static void
1507 pan_emit_vary_special(struct mali_attribute_packed *out,
1508 unsigned present, enum pan_special_varying buf,
1509 unsigned quirks)
1510 {
1511 assert(buf < PAN_VARY_MAX);
1512 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1513 }
1514
1515 static enum mali_format
1516 pan_xfb_format(enum mali_format format, unsigned nr)
1517 {
1518 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1519 return MALI_R32F | MALI_NR_CHANNELS(nr);
1520 else
1521 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1522 }
1523
1524 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1525 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1526 * value. */
1527
1528 static void
1529 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1530 unsigned present,
1531 unsigned max_xfb,
1532 unsigned *streamout_offsets,
1533 unsigned quirks,
1534 enum mali_format format,
1535 struct pipe_stream_output o)
1536 {
1537 unsigned swizzle = quirks & HAS_SWIZZLES ?
1538 panfrost_get_default_swizzle(o.num_components) :
1539 panfrost_bifrost_swizzle(o.num_components);
1540
1541 pan_pack(out, ATTRIBUTE, cfg) {
1542 /* XFB buffers come after everything else */
1543 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1544 cfg.offset_enable = quirks & IS_BIFROST ? false : true;
1545
1546 /* Override number of channels and precision to highp */
1547 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1548
1549 /* Apply given offsets together */
1550 cfg.offset = (o.dst_offset * 4) /* dwords */
1551 + streamout_offsets[o.output_buffer];
1552 }
1553 }
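
/* The byte offset computed above combines the capture's dst_offset, which
 * Gallium specifies in dwords (hence the * 4), with the running offset of the
 * bound streamout target for that buffer, as precomputed into
 * streamout_offsets[] by the caller. */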

/* Determine if we should capture a varying for XFB. This requires actually
 * having a buffer for it. If we don't capture it, we'll fall back to a general
 * varying path (linked or unlinked, possibly discarding the write). */

static bool
panfrost_xfb_captured(struct panfrost_shader_state *xfb,
                unsigned loc, unsigned max_xfb)
{
        if (!(xfb->so_mask & (1ll << loc)))
                return false;

        struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
        return o->output_buffer < max_xfb;
}

static void
pan_emit_general_varying(struct mali_attribute_packed *out,
                struct panfrost_shader_state *other,
                struct panfrost_shader_state *xfb,
                gl_varying_slot loc,
                enum mali_format format,
                unsigned present,
                unsigned quirks,
                unsigned *gen_offsets,
                enum mali_format *gen_formats,
                unsigned *gen_stride,
                unsigned idx,
                bool should_alloc)
{
        /* Check if we're linked */
        signed other_idx = -1;

        for (unsigned j = 0; j < other->varying_count; ++j) {
                if (other->varyings_loc[j] == loc) {
                        other_idx = j;
                        break;
                }
        }

        if (other_idx < 0) {
                pan_emit_vary_only(out, present, quirks);
                return;
        }

        unsigned offset = gen_offsets[other_idx];

        if (should_alloc) {
                /* We're linked, so allocate space via a watermark allocation */
                enum mali_format alt = other->varyings[other_idx];

                /* Do interpolation at minimum precision */
                unsigned size_main = pan_varying_size(format);
                unsigned size_alt = pan_varying_size(alt);
                unsigned size = MIN2(size_main, size_alt);

                /* If a varying is marked for XFB but not actually captured, we
                 * should match the format to the format that would otherwise
                 * be used for XFB, since dEQP checks for invariance here. It's
                 * unclear if this is required by the spec. */

                if (xfb->so_mask & (1ull << loc)) {
                        struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
                        format = pan_xfb_format(format, o->num_components);
                        size = pan_varying_size(format);
                } else if (size == size_alt) {
                        format = alt;
                }

                gen_offsets[idx] = *gen_stride;
                gen_formats[other_idx] = format;
                offset = *gen_stride;
                *gen_stride += size;
        }

        pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
}
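
/* Watermark allocation sketch (hypothetical sizes): if the vertex shader pass
 * links a 4-byte varying and then an 8-byte varying, they are placed at
 * offsets 0 and 4 within the general varying buffer and *gen_stride finishes
 * at 12; the fragment pass then reuses those offsets via gen_offsets[] rather
 * than allocating again (should_alloc is false). */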

/* Higher-level wrapper around all of the above, classifying a varying into one
 * of the above types */

static void
panfrost_emit_varying(
                struct mali_attribute_packed *out,
                struct panfrost_shader_state *stage,
                struct panfrost_shader_state *other,
                struct panfrost_shader_state *xfb,
                unsigned present,
                uint16_t point_sprite_mask,
                unsigned max_xfb,
                unsigned *streamout_offsets,
                unsigned quirks,
                unsigned *gen_offsets,
                enum mali_format *gen_formats,
                unsigned *gen_stride,
                unsigned idx,
                bool should_alloc,
                bool is_fragment)
{
        gl_varying_slot loc = stage->varyings_loc[idx];
        enum mali_format format = stage->varyings[idx];

        /* Override format to match linkage */
        if (!should_alloc && gen_formats[idx])
                format = gen_formats[idx];

        if (util_varying_is_point_coord(loc, point_sprite_mask)) {
                pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
        } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
                struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
                pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
        } else if (loc == VARYING_SLOT_POS) {
                if (is_fragment)
                        pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
                else
                        pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
        } else if (loc == VARYING_SLOT_PSIZ) {
                pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
        } else if (loc == VARYING_SLOT_PNTC) {
                pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
        } else if (loc == VARYING_SLOT_FACE) {
                pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
        } else {
                pan_emit_general_varying(out, other, xfb, loc, format, present,
                                quirks, gen_offsets, gen_formats, gen_stride,
                                idx, should_alloc);
        }
}
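
/* Note the classification priority above: point sprite coordinates win over
 * everything, then transform feedback captures, then the special slots
 * (position/fragcoord, point size, point coord, face), and only varyings that
 * match none of these take the general linked/unlinked path. */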

static void
pan_emit_special_input(struct mali_attribute_buffer_packed *out,
                unsigned present,
                enum pan_special_varying v,
                unsigned special)
{
        if (present & (1 << v)) {
                unsigned idx = pan_varying_index(present, v);

                pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
                        cfg.special = special;
                        cfg.type = 0;
                }
        }
}
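
/* Special inputs have no backing memory: when the corresponding bit is set in
 * the present mask, the record above simply selects a hardware-provided value
 * (point coordinate, front-facing flag or fragment coordinate) for that buffer
 * slot, so nothing is allocated and absent specials emit nothing at all. */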

void
panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                unsigned vertex_count,
                mali_ptr *vs_attribs,
                mali_ptr *fs_attribs,
                mali_ptr *buffers,
                mali_ptr *position,
                mali_ptr *psiz)
{
        /* Load the shaders */
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_device *dev = pan_device(ctx->base.screen);
        struct panfrost_shader_state *vs, *fs;
        size_t vs_size, fs_size;

        /* Allocate the varying descriptor */

        vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
        fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;

        struct panfrost_ptr trans = panfrost_pool_alloc_aligned(
                        &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);

        struct pipe_stream_output_info *so = &vs->stream_output;
        uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;

        /* TODO: point sprites need lowering on Bifrost */
        if (dev->quirks & IS_BIFROST)
                point_coord_mask = 0;

        unsigned present = pan_varying_present(vs, fs, dev->quirks, point_coord_mask);

        /* Check if this varying is linked by us. This is the case for
         * general-purpose, non-captured varyings. If it is, link it. If it's
         * not, use the provided stream out information to determine the
         * offset, since it was already linked for us. */

        unsigned gen_offsets[32];
        enum mali_format gen_formats[32];
        memset(gen_offsets, 0, sizeof(gen_offsets));
        memset(gen_formats, 0, sizeof(gen_formats));

        unsigned gen_stride = 0;
        assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
        assert(fs->varying_count < ARRAY_SIZE(gen_offsets));

        unsigned streamout_offsets[32];

        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
                streamout_offsets[i] = panfrost_streamout_offset(
                        so->stride[i],
                        ctx->streamout.targets[i]);
        }

        struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
        struct mali_attribute_packed *ofs = ovs + vs->varying_count;
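
        /* Layout of the transient allocation: the vertex shader's varying
         * records come first and the fragment shader's follow immediately
         * after, which is why *vs_attribs and *fs_attribs at the end of this
         * function point at trans.gpu and trans.gpu + vs_size respectively. */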

        for (unsigned i = 0; i < vs->varying_count; i++) {
                panfrost_emit_varying(ovs + i, vs, fs, vs, present, 0,
                                ctx->streamout.num_targets, streamout_offsets,
                                dev->quirks,
                                gen_offsets, gen_formats, &gen_stride, i, true, false);
        }

        for (unsigned i = 0; i < fs->varying_count; i++) {
                panfrost_emit_varying(ofs + i, fs, vs, vs, present, point_coord_mask,
                                ctx->streamout.num_targets, streamout_offsets,
                                dev->quirks,
                                gen_offsets, gen_formats, &gen_stride, i, false, true);
        }

        unsigned xfb_base = pan_xfb_base(present);
        struct panfrost_ptr T = panfrost_pool_alloc_aligned(&batch->pool,
                        MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets + 1),
                        MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
        struct mali_attribute_buffer_packed *varyings =
                (struct mali_attribute_buffer_packed *) T.cpu;
        /* Suppress prefetch on Bifrost by zeroing the extra trailing record
         * reserved in the allocation above (one past the special buffers and
         * the streamout targets). */
        memset(varyings + (xfb_base + ctx->streamout.num_targets), 0, sizeof(*varyings));

        /* Emit the stream out buffers */

        unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
                        ctx->vertex_count);

        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
                panfrost_emit_streamout(batch, &varyings[xfb_base + i],
                                so->stride[i],
                                out_count,
                                ctx->streamout.targets[i]);
        }

        panfrost_emit_varyings(batch,
                        &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
                        gen_stride, vertex_count);

        /* fp32 vec4 gl_Position */
        *position = panfrost_emit_varyings(batch,
                        &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
                        sizeof(float) * 4, vertex_count);

        if (present & (1 << PAN_VARY_PSIZ)) {
                *psiz = panfrost_emit_varyings(batch,
                                &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
                                2, vertex_count);
        }

        pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
        pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
        pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);

        *buffers = T.gpu;
        *vs_attribs = trans.gpu;
        *fs_attribs = trans.gpu + vs_size;
}

void
panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
                const struct panfrost_ptr *vertex_job,
                const struct panfrost_ptr *tiler_job)
{
        struct panfrost_context *ctx = batch->ctx;

        /* If rasterizer discard is enabled, only submit the vertex job */

        unsigned vertex = panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
                        vertex_job, false);

        if (ctx->rasterizer->base.rasterizer_discard)
                return;

        panfrost_add_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tiler_job, false);
}
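
/* The tiler job is scoreboarded against the vertex job: panfrost_add_job
 * returns the vertex job's index, which is then passed as the dependency when
 * the tiler job is added, so tiling for the draw only runs once vertex shading
 * has finished. */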

/* TODO: stop hardcoding this */
mali_ptr
panfrost_emit_sample_locations(struct panfrost_batch *batch)
{
        uint16_t locations[] = {
                128, 128,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                0, 256,
                128, 128,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
                0, 0,
        };
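
        /* The values are presumably fixed-point sub-pixel positions with 256
         * units per pixel, putting (128, 128) at the pixel centre; per the
         * TODO above, this table should eventually be computed rather than
         * hardcoded. */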

        return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
}
