/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "broadcom/cle/v3dx_pack.h"
#include "util/half_float.h"
#include "util/u_pack_color.h"
#include "vk_format_info.h"

const struct v3dv_dynamic_state default_dynamic_state = {
   .viewport = {
      .count = 0,
   },
   .scissor = {
      .count = 0,
   },
   .stencil_compare_mask =
   {
     .front = ~0u,
     .back = ~0u,
   },
   .stencil_write_mask =
   {
     .front = ~0u,
     .back = ~0u,
   },
   .stencil_reference =
   {
     .front = 0u,
     .back = 0u,
   },
   .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
   .depth_bias = {
      .constant_factor = 0.0f,
      .slope_factor = 0.0f,
   },
   .line_width = 1.0f,
};

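/* Adds a BO to the job's set of referenced BOs, ignoring duplicates. The job
 * does not take a reference on the BO: it must be kept alive by its API
 * object until the job has executed.
 */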
void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
{
   if (!bo)
      return;

   if (_mesa_set_search(job->bos, bo))
      return;

   _mesa_set_add(job->bos, bo);
   job->bo_count++;
}

static void
cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer);

VkResult
v3dv_CreateCommandPool(VkDevice _device,
                       const VkCommandPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator,
                       VkCommandPool *pCmdPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_cmd_pool *pool;

   /* We only support one queue */
   assert(pCreateInfo->queueFamilyIndex == 0);

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   if (pAllocator)
      pool->alloc = *pAllocator;
   else
      pool->alloc = device->alloc;

   list_inithead(&pool->cmd_buffers);

   *pCmdPool = v3dv_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_device *device,
                struct v3dv_cmd_pool *pool,
                VkCommandBufferLevel level)
{
   /* Do not reset the loader data header! If we are calling this from
    * a command buffer reset, that would reset the loader's dispatch table
    * for the command buffer.
    */
   const uint32_t ld_size = sizeof(VK_LOADER_DATA);
   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + ld_size;
   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - ld_size);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_inithead(&cmd_buffer->private_objs);
   list_inithead(&cmd_buffer->jobs);
   list_inithead(&cmd_buffer->list_link);

   assert(pool);
   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

   cmd_buffer->state.subpass_idx = -1;
   cmd_buffer->state.meta.subpass_idx = -1;

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
}

static VkResult
cmd_buffer_create(struct v3dv_device *device,
                  struct v3dv_cmd_pool *pool,
                  VkCommandBufferLevel level,
                  VkCommandBuffer *pCommandBuffer)
{
   struct v3dv_cmd_buffer *cmd_buffer;
   cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   cmd_buffer_init(cmd_buffer, device, pool, level);

   cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;

   *pCommandBuffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

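/* Releases the resources owned by a CL job: its command lists, its BO set
 * (without freeing the BOs themselves) and its tile alloc and tile state BOs.
 */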
static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);

   v3dv_cl_destroy(&job->bcl);
   v3dv_cl_destroy(&job->rcl);
   v3dv_cl_destroy(&job->indirect);

   /* Since we don't ref BOs when we add them to the command buffer, don't
    * unref them here either. BOs will be freed when their corresponding API
    * objects are destroyed.
    */
   _mesa_set_destroy(job->bos, NULL);

   v3dv_bo_free(job->device, job->tile_alloc);
   v3dv_bo_free(job->device, job->tile_state);
}

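/* Cloned jobs only own the cloned BO structs tracked by their command lists,
 * so that is all we need to free here.
 */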
static void
job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL);

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->alloc, bo);
   }
}

static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
   assert(job->cmd_buffer);

   v3dv_cl_destroy(&job->indirect);

   _mesa_set_destroy(job->bos, NULL);

   if (job->csd.shared_memory)
      v3dv_bo_free(job->device, job->csd.shared_memory);
}

static void
job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(job->cmd_buffer);
   vk_free(&job->cmd_buffer->device->alloc, job->cpu.event_wait.events);
}

static void
job_destroy_cpu_csd_indirect_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   assert(job->cmd_buffer);
   v3dv_job_destroy(job->cpu.csd_indirect.csd_job);
}

void
v3dv_job_destroy(struct v3dv_job *job)
{
   assert(job);

   list_del(&job->list_link);

   /* Cloned jobs don't make deep copies of the original jobs, so they don't
    * own any of their resources. However, they do allocate clones of BO
    * structs, so make sure we free those.
    */
   if (!job->is_clone) {
      switch (job->type) {
      case V3DV_JOB_TYPE_GPU_CL:
      case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
         job_destroy_gpu_cl_resources(job);
         break;
      case V3DV_JOB_TYPE_GPU_CSD:
         job_destroy_gpu_csd_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
         job_destroy_cpu_wait_events_resources(job);
         break;
      case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
         job_destroy_cpu_csd_indirect_resources(job);
         break;
      default:
         break;
      }
   } else {
      /* Cloned jobs */
      if (job->type == V3DV_JOB_TYPE_GPU_CL)
         job_destroy_cloned_gpu_cl_resources(job);
   }

   vk_free(&job->device->alloc, job);
}

void
v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                uint64_t obj,
                                v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
{
   struct v3dv_cmd_buffer_private_obj *pobj =
      vk_alloc(&cmd_buffer->device->alloc, sizeof(*pobj), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!pobj) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }

   pobj->obj = obj;
   pobj->destroy_cb = destroy_cb;

   list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
}

static void
cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_cmd_buffer_private_obj *pobj)
{
   assert(pobj && pobj->obj && pobj->destroy_cb);
   pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                    pobj->obj,
                    &cmd_buffer->device->alloc);
   list_del(&pobj->list_link);
   vk_free(&cmd_buffer->device->alloc, pobj);
}

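/* Frees everything owned by the command buffer: recorded jobs, attachment
 * state, pending query state, the push constants BO, private objects and any
 * meta attachment state.
 */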
static void
cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      v3dv_job_destroy(job);
   }

   if (cmd_buffer->state.job)
      v3dv_job_destroy(cmd_buffer->state.job);

   if (cmd_buffer->state.attachments)
      vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   if (cmd_buffer->state.query.end.alloc_count > 0)
      vk_free(&cmd_buffer->device->alloc, cmd_buffer->state.query.end.states);

   if (cmd_buffer->push_constants_resource.bo)
      v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);

   list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
                            &cmd_buffer->private_objs, list_link) {
      cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
   }

   if (cmd_buffer->state.meta.attachments) {
      assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
      vk_free(&cmd_buffer->device->alloc, cmd_buffer->state.meta.attachments);
   }
}

static void
cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);
   cmd_buffer_free_resources(cmd_buffer);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

void
v3dv_job_emit_binning_flush(struct v3dv_job *job)
{
   assert(job);

   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
   v3dv_return_if_oom(NULL, job);

   cl_emit(&job->bcl, FLUSH, flush);
}

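/* Returns true if every attachment used in l1 (ignoring VK_ATTACHMENT_UNUSED
 * entries) is also present in l2.
 */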
static bool
attachment_list_is_subset(struct v3dv_subpass_attachment *l1, uint32_t l1_count,
                          struct v3dv_subpass_attachment *l2, uint32_t l2_count)
{
   for (uint32_t i = 0; i < l1_count; i++) {
      uint32_t attachment_idx = l1[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      uint32_t j;
      for (j = 0; j < l2_count; j++) {
         if (l2[j].attachment == attachment_idx)
            break;
      }
      if (j == l2_count)
         return false;
   }

   return true;
}

static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t subpass_idx)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->pass);

   const struct v3dv_physical_device *physical_device =
      &cmd_buffer->device->instance->physicalDevice;

   if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      return false;

   if (!cmd_buffer->state.job)
      return false;

   if (cmd_buffer->state.job->always_flush)
      return false;

   if (!physical_device->options.merge_jobs)
      return false;

   /* Each render pass starts a new job */
   if (subpass_idx == 0)
      return false;

   /* Two subpasses can be merged in the same job if we can emit a single RCL
    * for them (since the RCL includes the END_OF_RENDERING command that
    * triggers the "render job finished" interrupt). We can do this so long
    * as both subpasses render against the same attachments.
    */
   assert(state->subpass_idx == subpass_idx - 1);
   struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
   struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];

   /* Because the list of subpass attachments can include VK_ATTACHMENT_UNUSED,
    * we need to check that for each subpass all its used attachments are
    * used by the other subpass.
    */
   bool compatible =
      attachment_list_is_subset(prev_subpass->color_attachments,
                                prev_subpass->color_count,
                                subpass->color_attachments,
                                subpass->color_count);
   if (!compatible)
      return false;

   compatible =
      attachment_list_is_subset(subpass->color_attachments,
                                subpass->color_count,
                                prev_subpass->color_attachments,
                                prev_subpass->color_count);
   if (!compatible)
      return false;

   if (subpass->ds_attachment.attachment !=
       prev_subpass->ds_attachment.attachment)
      return false;

   /* FIXME: Since some attachment formats can't be resolved using the TLB we
    * need to emit separate resolve jobs for them and that would not be
    * compatible with subpass merges. We could fix that by checking whether
    * any of the attachments to resolve doesn't support TLB resolves.
    */
   if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
      return false;

   return true;
}

/**
 * Computes and sets the job frame tiling information required to setup frame
 * binning and rendering.
 */
static struct v3dv_frame_tiling *
job_compute_frame_tiling(struct v3dv_job *job,
                         uint32_t width,
                         uint32_t height,
                         uint32_t layers,
                         uint32_t render_target_count,
                         uint8_t max_internal_bpp,
                         bool msaa)
{
   static const uint8_t tile_sizes[] = {
      64, 64,
      64, 32,
      32, 32,
      32, 16,
      16, 16,
      16,  8,
       8,  8
   };

   assert(job);
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   tiling->width = width;
   tiling->height = height;
   tiling->layers = layers;
   tiling->render_target_count = render_target_count;
   tiling->msaa = msaa;

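   /* Pick a tile size from the table above: start at 64x64 and step down to
    * smaller tiles as the render target count, MSAA and the maximum internal
    * bpp grow, since those increase the amount of tile buffer storage needed
    * per pixel.
    */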
   uint32_t tile_size_index = 0;

   if (render_target_count > 2)
      tile_size_index += 2;
   else if (render_target_count > 1)
      tile_size_index += 1;

   if (msaa)
      tile_size_index += 2;

   tiling->internal_bpp = max_internal_bpp;
   tile_size_index += tiling->internal_bpp;
   assert(tile_size_index < ARRAY_SIZE(tile_sizes));

   tiling->tile_width = tile_sizes[tile_size_index * 2];
   tiling->tile_height = tile_sizes[tile_size_index * 2 + 1];

   tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
   tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);

   /* Size up our supertiles until we get under the limit */
   const uint32_t max_supertiles = 256;
   tiling->supertile_width = 1;
   tiling->supertile_height = 1;
   for (;;) {
      tiling->frame_width_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
      tiling->frame_height_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
      const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
                                      tiling->frame_height_in_supertiles;
      if (num_supertiles < max_supertiles)
         break;

      if (tiling->supertile_width < tiling->supertile_height)
         tiling->supertile_width++;
      else
         tiling->supertile_height++;
   }

   return tiling;
}

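/* Starts a new frame in the job's BCL: computes the frame tiling spec,
 * allocates the tile allocation and tile state BOs, and emits the binning
 * mode configuration and start-of-tile-binning commands.
 */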
void
v3dv_job_start_frame(struct v3dv_job *job,
                     uint32_t width,
                     uint32_t height,
                     uint32_t layers,
                     uint32_t render_target_count,
                     uint8_t max_internal_bpp,
                     bool msaa)
{
   assert(job);

   /* Start by computing frame tiling spec for this job */
   const struct v3dv_frame_tiling *tiling =
      job_compute_frame_tiling(job,
                               width, height, layers,
                               render_target_count, max_internal_bpp, msaa);

   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
   v3dv_return_if_oom(NULL, job);

   /* The PTB will request the tile alloc initial size per tile at start
    * of tile binning.
    */
   uint32_t tile_alloc_size = 64 * tiling->layers *
                              tiling->draw_tiles_x *
                              tiling->draw_tiles_y;

   /* The PTB allocates in aligned 4k chunks after the initial setup. */
   tile_alloc_size = align(tile_alloc_size, 4096);

   /* Include the first two chunk allocations that the PTB does so that
    * we definitely clear the OOM condition before triggering one (the HW
    * won't trigger OOM during the first allocations).
    */
   tile_alloc_size += 8192;

   /* For performance, allocate some extra initial memory after the PTB's
    * minimal allocations, so that we hopefully don't have to block the
    * GPU on the kernel handling an OOM signal.
    */
   tile_alloc_size += 512 * 1024;

   job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
                                   "tile_alloc", true);
   if (!job->tile_alloc) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo(job, job->tile_alloc);

   const uint32_t tsda_per_tile_size = 256;
   const uint32_t tile_state_size = tiling->layers *
                                    tiling->draw_tiles_x *
                                    tiling->draw_tiles_y *
                                    tsda_per_tile_size;
   job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
   if (!job->tile_state) {
      v3dv_flag_oom(NULL, job);
      return;
   }

   v3dv_job_add_bo(job, job->tile_state);

   /* This must go before the binning mode configuration. It is
    * required for layered framebuffers to work.
    */
   cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
      config.number_of_layers = layers;
   }

   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
      config.width_in_pixels = tiling->width;
      config.height_in_pixels = tiling->height;
      config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
      config.multisample_mode_4x = tiling->msaa;
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
   }

   /* There's definitely nothing in the VCD cache we want. */
   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);

   /* "Binning mode lists must have a Start Tile Binning item (6) after
    *  any prefix state data before the binning list proper starts."
    */
   cl_emit(&job->bcl, START_TILE_BINNING, bin);

   job->ez_state = VC5_EZ_UNDECIDED;
   job->first_ez_state = VC5_EZ_UNDECIDED;
}

static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);

   /* Typically, we have a single job for each subpass and we emit the job's RCL
    * here when we are ending the frame for the subpass. However, some commands
    * such as vkCmdClearAttachments need to run in their own separate job and
    * they emit their own RCL even if they execute inside a subpass. In this
    * scenario, we don't want to emit subpass RCL when we end the frame for
    * those jobs, so we only emit the subpass RCL if the job has not recorded
    * any RCL commands of its own.
    */
   if (v3dv_cl_offset(&cmd_buffer->state.job->rcl) == 0)
      cmd_buffer_emit_render_pass_rcl(cmd_buffer);

   v3dv_job_emit_binning_flush(cmd_buffer->state.job);
}

static void
cmd_buffer_end_render_pass_secondary(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);
   v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
                                    cl_packet_length(RETURN_FROM_SUB_LIST));
   v3dv_return_if_oom(cmd_buffer, NULL);
   cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
}

struct v3dv_job *
v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                               enum v3dv_job_type type,
                               struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   struct v3dv_job *job = vk_zalloc(&device->alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
   return job;
}

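/* Emits CPU jobs for any state recorded with the last GPU job that must be
 * processed on the CPU after that job completes (currently, pending
 * end-of-query states).
 */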
static void
cmd_buffer_add_cpu_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   if (state->query.end.used_count > 0) {
      const uint32_t query_count = state->query.end.used_count;
      for (uint32_t i = 0; i < query_count; i++) {
         assert(i < state->query.end.used_count);
         struct v3dv_job *job =
            v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                           V3DV_JOB_TYPE_CPU_END_QUERY,
                                           cmd_buffer, -1);
         v3dv_return_if_oom(cmd_buffer, NULL);

         job->cpu.query_end = state->query.end.states[i];
         list_addtail(&job->list_link, &cmd_buffer->jobs);
      }
   }
}

void
v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job)
      return;

   if (cmd_buffer->state.oom) {
      v3dv_job_destroy(job);
      cmd_buffer->state.job = NULL;
      return;
   }

   /* If we have created a job for a command buffer then we should have
    * recorded something into it: if the job was started in a render pass, it
    * should at least have the start frame commands, otherwise, it should have
    * a transfer command. The only exception is secondary command buffers
    * inside a render pass.
    */
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          v3dv_cl_offset(&job->bcl) > 0);

   /* When we merge multiple subpasses into the same job we must only emit one
    * RCL, so we do that here, when we decide that we need to finish the job.
    * Any rendering that happens outside a render pass is never merged, so
    * the RCL should have been emitted by the time we got here.
    */
   assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);

   /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
    *    so we may need to generate an RCL and add a binning flush.
    *
    * 2. It is a partial CL recorded in a secondary command buffer, in which
    *    case we are not submitting it directly to the GPU but rather branch to
    *    it from a primary command buffer. In this case we just want to end
    *    the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
    *    will be in the primary job that branches to this CL.
    */
   if (cmd_buffer->state.pass) {
      if (job->type == V3DV_JOB_TYPE_GPU_CL) {
         cmd_buffer_end_render_pass_frame(cmd_buffer);
      } else {
         assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
         cmd_buffer_end_render_pass_secondary(cmd_buffer);
      }
   }

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires us to
    * emit CPU jobs after the job is completed, add them now. The only
    * exception is secondary command buffers inside a render pass, because in
    * that case we want to defer this until we finish recording the primary
    * job into which we execute the secondary.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
       !cmd_buffer->state.pass) {
      cmd_buffer_add_cpu_jobs_for_pending_state(cmd_buffer);
   }
}

static bool
job_type_is_gpu(struct v3dv_job *job)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
   case V3DV_JOB_TYPE_GPU_TFU:
   case V3DV_JOB_TYPE_GPU_CSD:
      return true;
   default:
      return false;
   }
}

static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   if (!cmd_buffer->state.has_barrier)
      return;

   /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized.
    */
   if (!job_type_is_gpu(job))
      return;

   job->serialize = true;
   if (cmd_buffer->state.has_bcl_barrier &&
       (job->type == V3DV_JOB_TYPE_GPU_CL ||
        job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
      job->needs_bcl_sync = true;
   }

   cmd_buffer->state.has_barrier = false;
   cmd_buffer->state.has_bcl_barrier = false;
}

void
v3dv_job_init(struct v3dv_job *job,
              enum v3dv_job_type type,
              struct v3dv_device *device,
              struct v3dv_cmd_buffer *cmd_buffer,
              int32_t subpass_idx)
{
   assert(job);

   /* Make sure we haven't made this new job current before calling here */
   assert(!cmd_buffer || cmd_buffer->state.job != job);

   job->type = type;

   job->device = device;
   job->cmd_buffer = cmd_buffer;

   list_inithead(&job->list_link);

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
       type == V3DV_JOB_TYPE_GPU_CSD) {
      job->bos =
         _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
      job->bo_count = 0;

      v3dv_cl_init(job, &job->indirect);

      if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
         job->always_flush = true;
   }

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
      v3dv_cl_init(job, &job->bcl);
      v3dv_cl_init(job, &job->rcl);
   }

   if (cmd_buffer) {
      /* Flag all state as dirty. Generally, we need to re-emit state for each
       * new job.
       *
       * FIXME: there may be some exceptions, in which case we could skip some
       * bits.
       */
      cmd_buffer->state.dirty = ~0;

      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      }

      /* Keep track of the first subpass that we are recording in this new job.
       * We will use this when we emit the RCL to decide how to emit our loads
       * and stores.
       */
      if (cmd_buffer->state.pass)
         job->first_subpass = subpass_idx;

      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
   }
}

struct v3dv_job *
v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                          int32_t subpass_idx,
                          enum v3dv_job_type type)
{
   /* Don't create a new job if we can merge the current subpass into
    * the current job.
    */
   if (cmd_buffer->state.pass &&
       subpass_idx != -1 &&
       cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
      cmd_buffer->state.job->is_subpass_finish = false;
      return cmd_buffer->state.job;
   }

   /* Ensure we are not starting a new job without finishing a previous one */
   if (cmd_buffer->state.job != NULL)
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   assert(cmd_buffer->state.job == NULL);
   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!job) {
      fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
   cmd_buffer->state.job = job;

   return job;
}

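/* Returns the command buffer to the initialized state. Note that, for now,
 * this always releases all of its resources, regardless of whether
 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was requested.
 */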
static VkResult
cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                 VkCommandBufferResetFlags flags)
{
   if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
      struct v3dv_device *device = cmd_buffer->device;
      struct v3dv_cmd_pool *pool = cmd_buffer->pool;
      VkCommandBufferLevel level = cmd_buffer->level;

      /* cmd_buffer_init below will re-add the command buffer to the pool
       * so remove it here so we don't end up adding it again.
       */
      list_del(&cmd_buffer->pool_link);

      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
         cmd_buffer_free_resources(cmd_buffer);

      cmd_buffer_init(cmd_buffer, device, pool, level);
   }

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
   return VK_SUCCESS;
}

VkResult
v3dv_AllocateCommandBuffers(VkDevice _device,
                            const VkCommandBufferAllocateInfo *pAllocateInfo,
                            VkCommandBuffer *pCommandBuffers)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = cmd_buffer_create(device, pool, pAllocateInfo->level,
                                 &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      v3dv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
                              i, pCommandBuffers);
      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

void
v3dv_FreeCommandBuffers(VkDevice device,
                        VkCommandPool commandPool,
                        uint32_t commandBufferCount,
                        const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (!cmd_buffer)
         continue;

      cmd_buffer_destroy(cmd_buffer);
   }
}

void
v3dv_DestroyCommandPool(VkDevice _device,
                        VkCommandPool commandPool,
                        const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_destroy(cmd_buffer);
   }

   vk_free2(&device->alloc, pAllocator, pool);
}

void
v3dv_TrimCommandPool(VkDevice device,
                     VkCommandPool commandPool,
                     VkCommandPoolTrimFlags flags)
{
   /* We don't need to do anything here, our command pools never hold on to
    * any resources from command buffers that are freed or reset.
    */
}


static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[cmd_buffer->state.subpass_idx];

   if (!subpass->resolve_attachments)
      return;

   struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;

   /* At this point we have already ended the current subpass and now we are
    * about to emit vkCmdResolveImage calls to get the resolves we can't
    * handle in the subpass RCL.
    *
    * vkCmdResolveImage is not supposed to be called inside a render pass so
    * before we call that we need to make sure our command buffer state reflects
    * that we are no longer in a subpass by finishing the current job and
    * resetting the framebuffer and render pass state temporarily and then
    * restoring it after we are done with the resolves.
    */
   if (cmd_buffer->state.job)
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass_idx = -1;

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t src_attachment_idx =
         subpass->color_attachments[i].attachment;
      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      if (pass->attachments[src_attachment_idx].use_tlb_resolve)
         continue;

      const uint32_t dst_attachment_idx =
         subpass->resolve_attachments[i].attachment;
      if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
      struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];

      VkImageResolve region = {
         .srcSubresource = {
            VK_IMAGE_ASPECT_COLOR_BIT,
            src_iview->base_level,
            src_iview->first_layer,
            src_iview->last_layer - src_iview->first_layer + 1,
         },
         .srcOffset = { 0, 0, 0 },
         .dstSubresource =  {
            VK_IMAGE_ASPECT_COLOR_BIT,
            dst_iview->base_level,
            dst_iview->first_layer,
            dst_iview->last_layer - dst_iview->first_layer + 1,
         },
         .dstOffset = { 0, 0, 0 },
         .extent = src_iview->image->extent,
      };

      VkImage src_image_handle =
         v3dv_image_to_handle((struct v3dv_image *) src_iview->image);
      VkImage dst_image_handle =
         v3dv_image_to_handle((struct v3dv_image *) dst_iview->image);
      v3dv_CmdResolveImage(cmd_buffer_handle,
                           src_image_handle,
                           VK_IMAGE_LAYOUT_GENERAL,
                           dst_image_handle,
                           VK_IMAGE_LAYOUT_GENERAL,
                           1, &region);
   }

   cmd_buffer->state.framebuffer = restore_fb;
   cmd_buffer->state.pass = restore_pass;
   cmd_buffer->state.subpass_idx = restore_subpass_idx;
}

static VkResult
cmd_buffer_begin_render_pass_secondary(
   struct v3dv_cmd_buffer *cmd_buffer,
   const VkCommandBufferInheritanceInfo *inheritance_info)
{
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
   assert(inheritance_info);

   cmd_buffer->state.pass =
      v3dv_render_pass_from_handle(inheritance_info->renderPass);
   assert(cmd_buffer->state.pass);

   cmd_buffer->state.framebuffer =
      v3dv_framebuffer_from_handle(inheritance_info->framebuffer);

   assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
   cmd_buffer->state.subpass_idx = inheritance_info->subpass;

   cmd_buffer->state.inheritance.occlusion_query_enable =
      inheritance_info->occlusionQueryEnable;

   /* Secondaries that execute inside a render pass won't start subpasses
    * so we want to create a job for them here.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
                                V3DV_JOB_TYPE_GPU_CL_SECONDARY);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Secondary command buffers don't know about the render area, but our
    * scissor setup accounts for it, so let's make sure we make it large
    * enough that it doesn't actually constrain any rendering. This should
    * be fine, since the Vulkan spec states:
    *
    *    "The application must ensure (using scissor if necessary) that all
    *     rendering is contained within the render area."
    *
    * FIXME: setup constants for the max framebuffer dimensions and use them
    * here and when filling in VkPhysicalDeviceLimits.
    */
   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   cmd_buffer->state.render_area.offset.x = 0;
   cmd_buffer->state.render_area.offset.y = 0;
   cmd_buffer->state.render_area.extent.width =
      framebuffer ? framebuffer->width : 4096;
   cmd_buffer->state.render_area.extent.height =
      framebuffer ? framebuffer->height : 4096;

   return VK_SUCCESS;
}

VkResult
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must initialize the
    * command buffer's state. Otherwise, we must reset its state. In both
    * cases we reset it.
    */
   VkResult result = cmd_buffer_reset(cmd_buffer, 0);
   if (result != VK_SUCCESS)
      return result;

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         result =
            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
                                                   pBeginInfo->pInheritanceInfo);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VkResult
v3dv_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                        VkCommandBufferResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   return cmd_buffer_reset(cmd_buffer, flags);
}

VkResult
v3dv_ResetCommandPool(VkDevice device,
                      VkCommandPool commandPool,
                      VkCommandPoolResetFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_pool, pool, commandPool);

   VkCommandBufferResetFlags reset_flags = 0;
   if (flags & VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT)
      reset_flags = VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT;
   list_for_each_entry_safe(struct v3dv_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link) {
      cmd_buffer_reset(cmd_buffer, reset_flags);
   }

   return VK_SUCCESS;
}

static void
emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
{
   assert(job);

   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
   v3dv_return_if_oom(NULL, job);

   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
      clip.clip_window_left_pixel_coordinate = rect->offset.x;
      clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
      clip.clip_window_width_in_pixels = rect->extent.width;
      clip.clip_window_height_in_pixels = rect->extent.height;
   }
}

static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* Render areas and scissor/viewport are only relevant inside render passes,
    * otherwise we are dealing with transfer operations where these elements
    * don't apply.
    */
   assert(cmd_buffer->state.pass);
   const VkRect2D *rect = &cmd_buffer->state.render_area;

   /* We should only call this at the beginning of a subpass so we should
    * always have framebuffer information available.
    */
   assert(cmd_buffer->state.framebuffer);
   cmd_buffer->state.tile_aligned_render_area =
      v3dv_subpass_area_is_tile_aligned(rect,
                                        cmd_buffer->state.framebuffer,
                                        cmd_buffer->state.pass,
                                        cmd_buffer->state.subpass_idx);

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area for subpass %d of render pass %p doesn't "
                 "match render pass granularity.\n",
                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
   }
}

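/* Packs a Vulkan clear color into the hardware clear value layout for the
 * given internal render target type and size.
 */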
void
v3dv_get_hw_clear_color(const VkClearColorValue *color,
                        uint32_t internal_type,
                        uint32_t internal_size,
                        uint32_t *hw_color)
{
   union util_color uc;
   switch (internal_type) {
   case V3D_INTERNAL_TYPE_8:
      util_pack_color(color->float32, PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
      memcpy(hw_color, uc.ui, internal_size);
      break;
   case V3D_INTERNAL_TYPE_8I:
   case V3D_INTERNAL_TYPE_8UI:
      hw_color[0] = ((color->uint32[0] & 0xff) |
                     (color->uint32[1] & 0xff) << 8 |
                     (color->uint32[2] & 0xff) << 16 |
                     (color->uint32[3] & 0xff) << 24);
      break;
   case V3D_INTERNAL_TYPE_16F:
      util_pack_color(color->float32, PIPE_FORMAT_R16G16B16A16_FLOAT, &uc);
      memcpy(hw_color, uc.ui, internal_size);
      break;
   case V3D_INTERNAL_TYPE_16I:
   case V3D_INTERNAL_TYPE_16UI:
      hw_color[0] = ((color->uint32[0] & 0xffff) | color->uint32[1] << 16);
      hw_color[1] = ((color->uint32[2] & 0xffff) | color->uint32[3] << 16);
      break;
   case V3D_INTERNAL_TYPE_32F:
   case V3D_INTERNAL_TYPE_32I:
   case V3D_INTERNAL_TYPE_32UI:
      memcpy(hw_color, color->uint32, internal_size);
      break;
   }
}

static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                            uint32_t attachment_idx,
                                            const VkClearColorValue *color)
{
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);

   const struct v3dv_render_pass_attachment *attachment =
      &cmd_buffer->state.pass->attachments[attachment_idx];

   uint32_t internal_type, internal_bpp;
   const struct v3dv_format *format = v3dv_get_format(attachment->desc.format);
   v3dv_get_internal_type_bpp_for_output_format(format->rt_type,
                                                &internal_type,
                                                &internal_bpp);

   uint32_t internal_size = 4 << internal_bpp;

   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   v3dv_get_hw_clear_color(color, internal_type, internal_size,
                           &attachment_state->clear_value.color[0]);

   attachment_state->vk_clear_value.color = *color;
}

static void
cmd_buffer_state_set_attachment_clear_depth_stencil(
   struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t attachment_idx,
   bool clear_depth, bool clear_stencil,
   const VkClearDepthStencilValue *ds)
{
   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   if (clear_depth)
      attachment_state->clear_value.z = ds->depth;

   if (clear_stencil)
      attachment_state->clear_value.s = ds->stencil;

   attachment_state->vk_clear_value.depthStencil = *ds;
}

static void
cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t count, const VkClearValue *values)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   /* There could be fewer clear values than attachments in the render pass, in
    * which case we only want to process as many as we have, or there could be
    * more, in which case we want to ignore those for which we don't have a
    * corresponding attachment.
    */
   count = MIN2(count, pass->attachment_count);
   for (uint32_t i = 0; i < count; i++) {
      const struct v3dv_render_pass_attachment *attachment =
         &pass->attachments[i];

      if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
      if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
                                                     &values[i].color);
      } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                            VK_IMAGE_ASPECT_STENCIL_BIT)) {
         cmd_buffer_state_set_attachment_clear_depth_stencil(
            cmd_buffer, i,
            aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
            aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
            &values[i].depthStencil);
      }
   }
}

static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
                                             const VkRenderPassBeginInfo *pRenderPassBegin)
{
   cmd_buffer_state_set_clear_values(cmd_buffer,
                                     pRenderPassBegin->clearValueCount,
                                     pRenderPassBegin->pClearValues);
}

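/* Makes sure the command buffer's per-attachment state array is large enough
 * for the attachments of the current render pass, reallocating it if needed.
 */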
1345 static void
cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer * cmd_buffer)1346 cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
1347 {
1348    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1349    const struct v3dv_render_pass *pass = state->pass;
1350 
1351    if (state->attachment_alloc_count < pass->attachment_count) {
1352       if (state->attachments > 0) {
1353          assert(state->attachment_alloc_count > 0);
1354          vk_free(&cmd_buffer->device->alloc, state->attachments);
1355       }
1356 
1357       uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
1358                       pass->attachment_count;
1359       state->attachments = vk_zalloc(&cmd_buffer->device->alloc, size, 8,
1360                                      VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1361       if (!state->attachments) {
1362          v3dv_flag_oom(cmd_buffer, NULL);
1363          return;
1364       }
1365       state->attachment_alloc_count = pass->attachment_count;
1366    }
1367 
1368    assert(state->attachment_alloc_count >= pass->attachment_count);
1369 }
1370 
1371 void
v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,const VkRenderPassBeginInfo * pRenderPassBegin,VkSubpassContents contents)1372 v3dv_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
1373                         const VkRenderPassBeginInfo *pRenderPassBegin,
1374                         VkSubpassContents contents)
1375 {
1376    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1377    V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1378    V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1379 
1380    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1381    state->pass = pass;
1382    state->framebuffer = framebuffer;
1383 
1384    cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
1385    v3dv_return_if_oom(cmd_buffer, NULL);
1386 
1387    cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);
1388 
1389    state->render_area = pRenderPassBegin->renderArea;
1390 
1391    /* If our render area is smaller than the current clip window we will have
1392     * to emit a new clip window to constraint it to the render area.
1393     */
1394    uint32_t min_render_x = state->render_area.offset.x;
1395    uint32_t min_render_y = state->render_area.offset.y;
1396    uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
1397    uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
1398    uint32_t min_clip_x = state->clip_window.offset.x;
1399    uint32_t min_clip_y = state->clip_window.offset.y;
1400    uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
1401    uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
1402    if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
1403        max_render_x < max_clip_x || max_render_y < max_clip_y) {
1404       state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
1405    }
1406 
1407    /* Setup for first subpass */
1408    v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
1409 }
1410 
1411 void
1412 v3dv_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
1413 {
1414    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1415 
1416    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1417    assert(state->subpass_idx < state->pass->subpass_count - 1);
1418 
1419    /* Finish the previous subpass */
1420    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1421    cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1422 
1423    /* Start the next subpass */
1424    v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
1425 }
1426 
1427 void
1428 v3dv_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
1429                                      int rt,
1430                                      uint32_t *rt_bpp,
1431                                      uint32_t *rt_type,
1432                                      uint32_t *rt_clamp)
1433 {
1434    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1435 
1436    assert(state->subpass_idx < state->pass->subpass_count);
1437    const struct v3dv_subpass *subpass =
1438       &state->pass->subpasses[state->subpass_idx];
1439 
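   /* Render targets not used by this subpass keep the caller-provided
    * defaults: we return early without writing the out parameters.
    */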
1440    if (rt >= subpass->color_count)
1441       return;
1442 
1443    struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
1444    const uint32_t attachment_idx = attachment->attachment;
1445    if (attachment_idx == VK_ATTACHMENT_UNUSED)
1446       return;
1447 
1448    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1449    assert(attachment_idx < framebuffer->attachment_count);
1450    struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
1451    assert(iview->aspects & VK_IMAGE_ASPECT_COLOR_BIT);
1452 
1453    *rt_bpp = iview->internal_bpp;
1454    *rt_type = iview->internal_type;
1455    *rt_clamp = vk_format_is_int(iview->vk_format) ?
1456       V3D_RENDER_TARGET_CLAMP_INT : V3D_RENDER_TARGET_CLAMP_NONE;
1457 }
1458 
1459 static void
1460 cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
1461                                  struct v3dv_cl *cl,
1462                                  struct v3dv_image_view *iview,
1463                                  uint32_t layer,
1464                                  uint32_t buffer)
1465 {
1466    const struct v3dv_image *image = iview->image;
1467    const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
1468    uint32_t layer_offset = v3dv_layer_offset(image,
1469                                              iview->base_level,
1470                                              iview->first_layer + layer);
1471 
1472    cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
1473       load.buffer_to_load = buffer;
1474       load.address = v3dv_cl_address(image->mem->bo, layer_offset);
1475 
1476       load.input_image_format = iview->format->rt_type;
1477       load.r_b_swap = iview->swap_rb;
1478       load.memory_format = slice->tiling;
1479 
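      /* The height_in_ub_or_stride field is overloaded: for UIF tiling modes
       * it holds the padded height in UIF blocks, for raster tiling it holds
       * the stride, and for other tiling modes it is left at the packet
       * default.
       */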
1480       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
1481           slice->tiling == VC5_TILING_UIF_XOR) {
1482          load.height_in_ub_or_stride =
1483             slice->padded_height_of_output_image_in_uif_blocks;
1484       } else if (slice->tiling == VC5_TILING_RASTER) {
1485          load.height_in_ub_or_stride = slice->stride;
1486       }
1487 
1488       if (image->samples > VK_SAMPLE_COUNT_1_BIT)
1489          load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
1490       else
1491          load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
1492    }
1493 }
1494 
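/* Returns true if the attachment contents must be loaded into the tile
 * buffer: a previous job or subpass may have written to it already, the
 * load operation explicitly requests a load, or the render area is not
 * tile-aligned and we must preserve pixels outside of it.
 */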
1495 static bool
1496 check_needs_load(const struct v3dv_cmd_buffer_state *state,
1497                  uint32_t att_first_subpass_idx,
1498                  VkAttachmentLoadOp load_op)
1499 {
1500    return state->job->first_subpass > att_first_subpass_idx ||
1501           state->job->is_subpass_continue ||
1502           load_op == VK_ATTACHMENT_LOAD_OP_LOAD ||
1503           !state->tile_aligned_render_area;
1504 }
1505 
1506 static void
1507 cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
1508                                   struct v3dv_cl *cl,
1509                                   uint32_t layer)
1510 {
1511    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1512    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1513    const struct v3dv_render_pass *pass = state->pass;
1514    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1515 
1516    for (uint32_t i = 0; i < subpass->color_count; i++) {
1517       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1518 
1519       if (attachment_idx == VK_ATTACHMENT_UNUSED)
1520          continue;
1521 
1522       const struct v3dv_render_pass_attachment *attachment =
1523          &state->pass->attachments[attachment_idx];
1524 
1525       /* According to the Vulkan spec:
1526        *
1527        *    "The load operation for each sample in an attachment happens before
1528        *     any recorded command which accesses the sample in the first subpass
1529        *     where the attachment is used."
1530        *
1531        * If the load operation is CLEAR, we must only clear once on the first
1532        * subpass that uses the attachment (and in that case we don't LOAD).
1533        * After that, we always want to load so we don't lose any rendering done
1534        * by a previous subpass to the same attachment. We also want to load
1535        * if the current job is continuing subpass work started by a previous
1536        * job, for the same reason.
1537        *
1538        * If the render area is not aligned to tile boundaries then we have
1539        * tiles which are partially covered by it. In this case, we need to
1540        * load the tiles so we can preserve the pixels that are outside the
1541        * render area for any such tiles.
1542        */
1543       bool needs_load = check_needs_load(state,
1544                                          attachment->first_subpass,
1545                                          attachment->desc.loadOp);
1546       if (needs_load) {
1547          struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx];
1548          cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
1549                                           layer, RENDER_TARGET_0 + i);
1550       }
1551    }
1552 
1553    uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1554    if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1555       const struct v3dv_render_pass_attachment *ds_attachment =
1556          &state->pass->attachments[ds_attachment_idx];
1557 
1558       const bool needs_depth_load =
1559          vk_format_has_depth(ds_attachment->desc.format) &&
1560          check_needs_load(state, ds_attachment->first_subpass,
1561                           ds_attachment->desc.loadOp);
1562 
1563       const bool needs_stencil_load =
1564          vk_format_has_stencil(ds_attachment->desc.format) &&
1565          check_needs_load(state, ds_attachment->first_subpass,
1566                           ds_attachment->desc.stencilLoadOp);
1567 
1568       if (needs_depth_load || needs_stencil_load) {
1569          struct v3dv_image_view *iview =
1570             framebuffer->attachments[ds_attachment_idx];
1571          /* From the Vulkan spec:
1572           *
1573           *   "When an image view of a depth/stencil image is used as a
1574           *   depth/stencil framebuffer attachment, the aspectMask is ignored
1575           *   and both depth and stencil image subresources are used."
1576           *
1577           * So we ignore the aspects from the subresource range of the image
1578           * view for the depth/stencil attachment, but we still need to restrict
1579           * this to the aspects compatible with the render pass and the image.
1580           */
1581          const uint32_t zs_buffer =
1582             v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
1583          cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
1584                                           iview, layer, zs_buffer);
1585       }
1586    }
1587 
1588    cl_emit(cl, END_OF_LOADS, end);
1589 }
1590 
1591 static void
1592 cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
1593                                   struct v3dv_cl *cl,
1594                                   uint32_t attachment_idx,
1595                                   uint32_t layer,
1596                                   uint32_t buffer,
1597                                   bool clear,
1598                                   bool is_multisample_resolve)
1599 {
1600    const struct v3dv_image_view *iview =
1601       cmd_buffer->state.framebuffer->attachments[attachment_idx];
1602    const struct v3dv_image *image = iview->image;
1603    const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
1604    uint32_t layer_offset = v3dv_layer_offset(image,
1605                                              iview->base_level,
1606                                              iview->first_layer + layer);
1607 
1608    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
1609       store.buffer_to_store = buffer;
1610       store.address = v3dv_cl_address(image->mem->bo, layer_offset);
1611       store.clear_buffer_being_stored = clear;
1612 
1613       store.output_image_format = iview->format->rt_type;
1614       store.r_b_swap = iview->swap_rb;
1615       store.memory_format = slice->tiling;
1616 
1617       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
1618           slice->tiling == VC5_TILING_UIF_XOR) {
1619          store.height_in_ub_or_stride =
1620             slice->padded_height_of_output_image_in_uif_blocks;
1621       } else if (slice->tiling == VC5_TILING_RASTER) {
1622          store.height_in_ub_or_stride = slice->stride;
1623       }
1624 
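      /* Multisampled attachments store all samples, resolve stores use 4x
       * decimation to produce the single-sample resolved output, and
       * everything else stores sample 0 only.
       */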
1625       if (image->samples > VK_SAMPLE_COUNT_1_BIT)
1626          store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
1627       else if (is_multisample_resolve)
1628          store.decimate_mode = V3D_DECIMATE_MODE_4X;
1629       else
1630          store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
1631    }
1632 }
1633 
1634 static void
1635 cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
1636                                    struct v3dv_cl *cl,
1637                                    uint32_t layer)
1638 {
1639    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1640    const struct v3dv_subpass *subpass =
1641       &state->pass->subpasses[state->subpass_idx];
1642 
1643    bool has_stores = false;
1644    bool use_global_clear = false;
1645 
1646    /* FIXME: separate stencil */
1647    uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1648    if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1649       const struct v3dv_render_pass_attachment *ds_attachment =
1650          &state->pass->attachments[ds_attachment_idx];
1651 
1652       assert(state->job->first_subpass >= ds_attachment->first_subpass);
1653       assert(state->subpass_idx >= ds_attachment->first_subpass);
1654       assert(state->subpass_idx <= ds_attachment->last_subpass);
1655 
1656       /* From the Vulkan spec, VkImageSubresourceRange:
1657        *
1658        *   "When an image view of a depth/stencil image is used as a
1659        *   depth/stencil framebuffer attachment, the aspectMask is ignored
1660        *   and both depth and stencil image subresources are used."
1661        *
1662        * So we ignore the aspects from the subresource range of the image
1663        * view for the depth/stencil attachment, but we still need to restrict
1664        * this to the aspects compatible with the render pass and the image.
1665        */
1666       const VkImageAspectFlags aspects =
1667          vk_format_aspects(ds_attachment->desc.format);
1668 
1669       /* Only clear once on the first subpass that uses the attachment */
1670       bool needs_depth_clear =
1671          (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1672          state->tile_aligned_render_area &&
1673          state->job->first_subpass == ds_attachment->first_subpass &&
1674          ds_attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
1675          !state->job->is_subpass_continue &&
1676          !subpass->do_depth_clear_with_draw;
1677 
1678       bool needs_stencil_clear =
1679          (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
1680          state->tile_aligned_render_area &&
1681          state->job->first_subpass == ds_attachment->first_subpass &&
1682          ds_attachment->desc.stencilLoadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
1683          !state->job->is_subpass_continue &&
1684          !subpass->do_stencil_clear_with_draw;
1685 
1686       /* Skip the last store if it is not required */
1687       bool needs_depth_store =
1688          (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
1689          (state->subpass_idx < ds_attachment->last_subpass ||
1690           ds_attachment->desc.storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
1691           !state->job->is_subpass_finish);
1692 
1693       bool needs_stencil_store =
1694          (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
1695          (state->subpass_idx < ds_attachment->last_subpass ||
1696           ds_attachment->desc.stencilStoreOp == VK_ATTACHMENT_STORE_OP_STORE ||
1697           !state->job->is_subpass_finish);
1698 
1699       /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
1700        * for depth/stencil.  In addition, the clear packet's Z/S bit is broken,
1701        * but the RTs bit ends up clearing Z/S.
1702        *
1703        * So if we have to emit a clear of depth or stencil we don't use
1704        * per-buffer clears, not even for color, since we will have to emit
1705        * a clear command for all tile buffers (including color) to handle
1706        * the depth/stencil clears.
1707        *
1708        * Note that this bug is not reproduced in the simulator, where
1709        * using the clear buffer bit in depth/stencil stores seems to work
1710        * correctly.
1711        */
1712       use_global_clear = needs_depth_clear || needs_stencil_clear;
1713       if (needs_depth_store || needs_stencil_store) {
1714          const uint32_t zs_buffer =
1715             v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
1716          cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
1717                                            ds_attachment_idx, layer,
1718                                            zs_buffer, false, false);
1719          has_stores = true;
1720       }
1721    }
1722 
1723    for (uint32_t i = 0; i < subpass->color_count; i++) {
1724       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1725 
1726       if (attachment_idx == VK_ATTACHMENT_UNUSED)
1727          continue;
1728 
1729       const struct v3dv_render_pass_attachment *attachment =
1730          &state->pass->attachments[attachment_idx];
1731 
1732       assert(state->job->first_subpass >= attachment->first_subpass);
1733       assert(state->subpass_idx >= attachment->first_subpass);
1734       assert(state->subpass_idx <= attachment->last_subpass);
1735 
1736       /* Only clear once on the first subpass that uses the attachment */
1737       bool needs_clear =
1738          state->tile_aligned_render_area &&
1739          state->job->first_subpass == attachment->first_subpass &&
1740          attachment->desc.loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
1741          !state->job->is_subpass_continue;
1742 
1743       /* Skip the last store if it is not required  */
1744       bool needs_store =
1745          state->subpass_idx < attachment->last_subpass ||
1746          attachment->desc.storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
1747          !state->job->is_subpass_finish;
1748 
1749       /* If we need to resolve this attachment emit that store first. Notice
1750        * that we must not request a tile buffer clear here in that case, since
1751        * that would clear the tile buffer before we get to emit the actual
1752        * color attachment store below, since the clear happens after the
1753        * store is completed.
1754        *
1755        * If the attachment doesn't support TLB resolves then we will have to
1756        * fall back to doing the resolve in a shader separately after this
1757        * job, so we will need to store the multisampled attachment even if
1758        * that wasn't requested by the client.
1759        */
1760       const bool needs_resolve =
1761          subpass->resolve_attachments &&
1762          subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
1763       if (needs_resolve && attachment->use_tlb_resolve) {
1764          const uint32_t resolve_attachment_idx =
1765             subpass->resolve_attachments[i].attachment;
1766          cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
1767                                            resolve_attachment_idx, layer,
1768                                            RENDER_TARGET_0 + i,
1769                                            false, true);
1770          has_stores = true;
1771       } else if (needs_resolve) {
1772          needs_store = true;
1773       }
1774 
1775       /* Emit the color attachment store if needed */
1776       if (needs_store) {
1777          cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
1778                                            attachment_idx, layer,
1779                                            RENDER_TARGET_0 + i,
1780                                            needs_clear && !use_global_clear,
1781                                            false);
1782          has_stores = true;
1783       } else if (needs_clear) {
1784          use_global_clear = true;
1785       }
1786    }
1787 
1788    /* We always need to emit at least one dummy store */
1789    if (!has_stores) {
1790       cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
1791          store.buffer_to_store = NONE;
1792       }
1793    }
1794 
1795    /* If we have any depth/stencil clears we can't use the per-buffer clear
1796     * bit and instead we have to emit a single clear of all tile buffers.
1797     */
1798    if (use_global_clear) {
1799       cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
1800          clear.clear_z_stencil_buffer = true;
1801          clear.clear_all_render_targets = true;
1802       }
1803    }
1804 }
1805 
1806 static void
1807 cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
1808                                          uint32_t layer)
1809 {
1810    struct v3dv_job *job = cmd_buffer->state.job;
1811    assert(job);
1812 
1813    /* Emit the generic list in our indirect state -- the rcl will just
1814     * have pointers into it.
1815     */
1816    struct v3dv_cl *cl = &job->indirect;
1817    v3dv_cl_ensure_space(cl, 200, 1);
1818    v3dv_return_if_oom(cmd_buffer, NULL);
1819 
1820    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
1821 
1822    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
1823 
1824    cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);
1825 
1826    /* The binner starts out writing tiles assuming that the initial mode
1827     * is triangles, so make sure that's the case.
1828     */
1829    cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
1830       fmt.primitive_type = LIST_TRIANGLES;
1831    }
1832 
1833    /* The PTB assumes this value is 0, but the HW will not set it. */
1834    cl_emit(cl, SET_INSTANCEID, set) {
1835       set.instance_id = 0;
1836    }
1837 
1838    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
1839 
1840    cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);
1841 
1842    cl_emit(cl, END_OF_TILE_MARKER, end);
1843 
1844    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
1845 
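   /* Reference the per-tile generic list we just wrote into the indirect CL
    * from the RCL.
    */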
1846    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
1847       branch.start = tile_list_start;
1848       branch.end = v3dv_cl_get_address(cl);
1849    }
1850 }
1851 
1852 static void
1853 cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
1854                                       uint32_t layer)
1855 {
1856    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1857 
1858    struct v3dv_job *job = cmd_buffer->state.job;
1859    struct v3dv_cl *rcl = &job->rcl;
1860 
1861    /* If doing multicore binning, we would need to initialize each
1862     * core's tile list here.
1863     */
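   /* The per-layer offset into the tile alloc BO assumes 64 bytes of tile
    * list state per tile, which presumably corresponds to the
    * TILE_ALLOCATION_BLOCK_SIZE_64B initial block size programmed in the RCL.
    */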
1864    const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
1865    const uint32_t tile_alloc_offset =
1866       64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
1867    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
1868       list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
1869    }
1870 
1871    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
1872       config.number_of_bin_tile_lists = 1;
1873       config.total_frame_width_in_tiles = tiling->draw_tiles_x;
1874       config.total_frame_height_in_tiles = tiling->draw_tiles_y;
1875 
1876       config.supertile_width_in_tiles = tiling->supertile_width;
1877       config.supertile_height_in_tiles = tiling->supertile_height;
1878 
1879       config.total_frame_width_in_supertiles =
1880          tiling->frame_width_in_supertiles;
1881       config.total_frame_height_in_supertiles =
1882          tiling->frame_height_in_supertiles;
1883    }
1884 
1885    /* Start by clearing the tile buffer. */
1886    cl_emit(rcl, TILE_COORDINATES, coords) {
1887       coords.tile_column_number = 0;
1888       coords.tile_row_number = 0;
1889    }
1890 
1891    /* Emit an initial clear of the tile buffers. This is necessary
1892     * for any buffers that should be cleared (since clearing
1893     * normally happens at the *end* of the generic tile list), but
1894     * it's also nice to clear everything so the first tile doesn't
1895     * inherit any contents from some previous frame.
1896     *
1897     * Also, implement the GFXH-1742 workaround. There's a race in
1898     * the HW between the RCL updating the TLB's internal type/size
1899     * and the spawning of the QPU instances using the TLB's current
1900     * internal type/size. To make sure the QPUs get the right
1901     * state, we need 1 dummy store in between internal type/size
1902     * changes on V3D 3.x, and 2 dummy stores on 4.x.
1903     */
1904    for (int i = 0; i < 2; i++) {
1905       if (i > 0)
1906          cl_emit(rcl, TILE_COORDINATES, coords);
1907       cl_emit(rcl, END_OF_LOADS, end);
1908       cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
1909          store.buffer_to_store = NONE;
1910       }
1911       if (i == 0 && cmd_buffer->state.tile_aligned_render_area) {
1912          cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
1913             clear.clear_z_stencil_buffer = true;
1914             clear.clear_all_render_targets = true;
1915          }
1916       }
1917       cl_emit(rcl, END_OF_TILE_MARKER, end);
1918    }
1919 
1920    cl_emit(rcl, FLUSH_VCD_CACHE, flush);
1921 
1922    cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
1923 
1924    uint32_t supertile_w_in_pixels =
1925       tiling->tile_width * tiling->supertile_width;
1926    uint32_t supertile_h_in_pixels =
1927       tiling->tile_height * tiling->supertile_height;
1928    const uint32_t min_x_supertile =
1929       state->render_area.offset.x / supertile_w_in_pixels;
1930    const uint32_t min_y_supertile =
1931       state->render_area.offset.y / supertile_h_in_pixels;
1932 
1933    uint32_t max_render_x = state->render_area.offset.x;
1934    if (state->render_area.extent.width > 0)
1935       max_render_x += state->render_area.extent.width - 1;
1936    uint32_t max_render_y = state->render_area.offset.y;
1937    if (state->render_area.extent.height > 0)
1938       max_render_y += state->render_area.extent.height - 1;
1939    const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
1940    const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
1941 
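   /* Only emit coordinates for the supertiles that intersect the render
    * area; this is what restricts rendering to the requested region.
    */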
1942    for (int y = min_y_supertile; y <= max_y_supertile; y++) {
1943       for (int x = min_x_supertile; x <= max_x_supertile; x++) {
1944          cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
1945             coords.column_number_in_supertiles = x;
1946             coords.row_number_in_supertiles = y;
1947          }
1948       }
1949    }
1950 }
1951 
1952 static void
1953 set_rcl_early_z_config(struct v3dv_job *job,
1954                        uint32_t fb_width,
1955                        uint32_t fb_height,
1956                        bool needs_depth_load,
1957                        bool *early_z_disable,
1958                        uint32_t *early_z_test_and_update_direction)
1959 {
1960    switch (job->first_ez_state) {
1961    case VC5_EZ_UNDECIDED:
1962    case VC5_EZ_LT_LE:
1963       *early_z_disable = false;
1964       *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
1965       break;
1966    case VC5_EZ_GT_GE:
1967       *early_z_disable = false;
1968       *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
1969       break;
1970    case VC5_EZ_DISABLED:
1971       *early_z_disable = true;
1972       break;
1973    }
1974 
1975    /* GFXH-1918: the early-z buffer may load incorrect depth values
1976     * if the frame has odd width or height.
1977     */
1978    if (*early_z_disable == false && needs_depth_load &&
1979        ((fb_width % 2) != 0 || (fb_height % 2) != 0)) {
1980       perf_debug("Loading depth aspect for framebuffer with odd width "
1981                  "or height disables early-Z tests.\n");
1982       *early_z_disable = true;
1983    }
1984 }
1985 
1986 static void
1987 cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
1988 {
1989    struct v3dv_job *job = cmd_buffer->state.job;
1990    assert(job);
1991 
1992    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1993    const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1994 
1995    /* We can't emit the RCL until we have a framebuffer, which we may not have
1996     * if we are recording a secondary command buffer. In that case, we will
1997     * have to wait until vkCmdExecuteCommands is called from a primary command
1998     * buffer.
1999     */
2000    if (!framebuffer) {
2001       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
2002       return;
2003    }
2004 
2005    const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
2006 
2007    const uint32_t fb_layers = framebuffer->layers;
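   /* Conservative estimate of the RCL size: a fixed amount for the frame
    * config packets plus room for the per-layer supertile coordinates.
    */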
2008    v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
2009                                     MAX2(fb_layers, 1) * 256 *
2010                                     cl_packet_length(SUPERTILE_COORDINATES));
2011    v3dv_return_if_oom(cmd_buffer, NULL);
2012 
2013    assert(state->subpass_idx < state->pass->subpass_count);
2014    const struct v3dv_render_pass *pass = state->pass;
2015    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
2016    struct v3dv_cl *rcl = &job->rcl;
2017 
2018    /* Common config must be the first TILE_RENDERING_MODE_CFG and
2019     * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
2020     * updates to the previous HW state.
2021     */
2022    const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
2023    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
2024       config.image_width_pixels = framebuffer->width;
2025       config.image_height_pixels = framebuffer->height;
2026       config.number_of_render_targets = MAX2(subpass->color_count, 1);
2027       config.multisample_mode_4x = tiling->msaa;
2028       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
2029 
2030       if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
2031          const struct v3dv_image_view *iview =
2032             framebuffer->attachments[ds_attachment_idx];
2033          config.internal_depth_type = iview->internal_type;
2034 
2035          bool needs_depth_load =
2036             check_needs_load(state,
2037                              pass->attachments[ds_attachment_idx].first_subpass,
2038                              pass->attachments[ds_attachment_idx].desc.loadOp);
2039 
2040          set_rcl_early_z_config(job,
2041                                 framebuffer->width,
2042                                 framebuffer->height,
2043                                 needs_depth_load,
2044                                 &config.early_z_disable,
2045                                 &config.early_z_test_and_update_direction);
2046       } else {
2047          config.early_z_disable = true;
2048       }
2049    }
2050 
2051    for (uint32_t i = 0; i < subpass->color_count; i++) {
2052       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
2053       if (attachment_idx == VK_ATTACHMENT_UNUSED)
2054          continue;
2055 
2056       struct v3dv_image_view *iview =
2057          state->framebuffer->attachments[attachment_idx];
2058 
2059       const struct v3dv_image *image = iview->image;
2060       const struct v3d_resource_slice *slice = &image->slices[iview->base_level];
2061 
2062       const uint32_t *clear_color =
2063          &state->attachments[attachment_idx].clear_value.color[0];
2064 
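      /* Compute an explicit UIF padded height for the clear when the slice
       * padding exceeds the implicit padding derived from the frame height
       * by 15 or more UIF blocks.
       */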
2065       uint32_t clear_pad = 0;
2066       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
2067           slice->tiling == VC5_TILING_UIF_XOR) {
2068          int uif_block_height = v3d_utile_height(image->cpp) * 2;
2069 
2070          uint32_t implicit_padded_height =
2071             align(framebuffer->height, uif_block_height) / uif_block_height;
2072 
2073          if (slice->padded_height_of_output_image_in_uif_blocks -
2074              implicit_padded_height >= 15) {
2075             clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
2076          }
2077       }
2078 
2079       cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
2080          clear.clear_color_low_32_bits = clear_color[0];
2081          clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
2082          clear.render_target_number = i;
2083       };
2084 
2085       if (iview->internal_bpp >= V3D_INTERNAL_BPP_64) {
2086          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
2087             clear.clear_color_mid_low_32_bits =
2088               ((clear_color[1] >> 24) | (clear_color[2] << 8));
2089             clear.clear_color_mid_high_24_bits =
2090               ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
2091             clear.render_target_number = i;
2092          };
2093       }
2094 
2095       if (iview->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
2096          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
2097             clear.uif_padded_height_in_uif_blocks = clear_pad;
2098             clear.clear_color_high_16_bits = clear_color[3] >> 16;
2099             clear.render_target_number = i;
2100          };
2101       }
2102    }
2103 
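   /* Emit the per-render-target internal bpp/type/clamp config. Slots beyond
    * the subpass color count keep the packet defaults, since
    * v3dv_render_pass_setup_render_target returns early for them.
    */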
2104    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
2105       v3dv_render_pass_setup_render_target(cmd_buffer, 0,
2106                                            &rt.render_target_0_internal_bpp,
2107                                            &rt.render_target_0_internal_type,
2108                                            &rt.render_target_0_clamp);
2109       v3dv_render_pass_setup_render_target(cmd_buffer, 1,
2110                                            &rt.render_target_1_internal_bpp,
2111                                            &rt.render_target_1_internal_type,
2112                                            &rt.render_target_1_clamp);
2113       v3dv_render_pass_setup_render_target(cmd_buffer, 2,
2114                                            &rt.render_target_2_internal_bpp,
2115                                            &rt.render_target_2_internal_type,
2116                                            &rt.render_target_2_clamp);
2117       v3dv_render_pass_setup_render_target(cmd_buffer, 3,
2118                                            &rt.render_target_3_internal_bpp,
2119                                            &rt.render_target_3_internal_type,
2120                                            &rt.render_target_3_clamp);
2121    }
2122 
2123    /* Ends rendering mode config. */
2124    if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
2125       cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
2126          clear.z_clear_value =
2127             state->attachments[ds_attachment_idx].clear_value.z;
2128          clear.stencil_clear_value =
2129             state->attachments[ds_attachment_idx].clear_value.s;
2130       };
2131    } else {
2132       cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
2133          clear.z_clear_value = 1.0f;
2134          clear.stencil_clear_value = 0;
2135       };
2136    }
2137 
2138    /* Always set initial block size before the first branch, which needs
2139     * to match the value from binning mode config.
2140     */
2141    cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
2142       init.use_auto_chained_tile_lists = true;
2143       init.size_of_first_block_in_chained_tile_lists =
2144          TILE_ALLOCATION_BLOCK_SIZE_64B;
2145    }
2146 
2147    for (int layer = 0; layer < MAX2(1, fb_layers); layer++)
2148       cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
2149 
2150    cl_emit(rcl, END_OF_RENDERING, end);
2151 }
2152 
2153 static void
2154 cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
2155 {
2156    assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
2157 
2158    assert(cmd_buffer->state.pass);
2159    assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
2160    const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2161    const struct v3dv_render_pass *pass = state->pass;
2162    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
2163 
2164    /* We only need to emit subpass clears as draw calls when the render
2165     * area is not aligned to tile boundaries or for GFXH-1461.
2166     */
2167    if (cmd_buffer->state.tile_aligned_render_area &&
2168        !subpass->do_depth_clear_with_draw &&
2169        !subpass->do_stencil_clear_with_draw) {
2170       return;
2171    }
2172 
2173    uint32_t att_count = 0;
2174    VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
2175 
2176    /* We only need to emit subpass clears as draw calls for color attachments
2177     * if the render area is not aligned to tile boundaries.
2178     */
2179    if (!cmd_buffer->state.tile_aligned_render_area) {
2180       for (uint32_t i = 0; i < subpass->color_count; i++) {
2181          const uint32_t att_idx = subpass->color_attachments[i].attachment;
2182          if (att_idx == VK_ATTACHMENT_UNUSED)
2183             continue;
2184 
2185          struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
2186          if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
2187             continue;
2188 
2189          if (state->subpass_idx != att->first_subpass)
2190             continue;
2191 
2192          atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
2193          atts[att_count].colorAttachment = i;
2194          atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
2195          att_count++;
2196       }
2197    }
2198 
2199    /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
2200    const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
2201    if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
2202       struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
2203       if (state->subpass_idx == att->first_subpass) {
2204          VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
2205          if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
2206              (cmd_buffer->state.tile_aligned_render_area &&
2207               !subpass->do_depth_clear_with_draw)) {
2208             aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2209          }
2210          if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
2211              (cmd_buffer->state.tile_aligned_render_area &&
2212               !subpass->do_stencil_clear_with_draw)) {
2213             aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2214          }
2215          if (aspects) {
2216             atts[att_count].aspectMask = aspects;
2217             atts[att_count].colorAttachment = 0; /* Ignored */
2218             atts[att_count].clearValue =
2219                state->attachments[ds_att_idx].vk_clear_value;
2220             att_count++;
2221          }
2222       }
2223    }
2224 
2225    if (att_count == 0)
2226       return;
2227 
2228    if (!cmd_buffer->state.tile_aligned_render_area) {
2229       perf_debug("Render area doesn't match render pass granularity, falling "
2230                  "back to vkCmdClearAttachments for "
2231                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
2232    } else if (subpass->do_depth_clear_with_draw ||
2233               subpass->do_stencil_clear_with_draw) {
2234       perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
2235                  "falling back to vkCmdClearAttachments for "
2236                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
2237    }
2238 
2239    /* From the Vulkan 1.0 spec:
2240     *
2241     *    "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
2242     *     render area will be cleared to a uniform value, which is specified
2243     *     when a render pass instance is begun."
2244     *
2245     * So the clear is only constrained by the render area and not by pipeline
2246     * state such as scissor or viewport, these are the semantics of
2247     * vkCmdClearAttachments as well.
2248     */
2249    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2250    VkClearRect rect = {
2251       .rect = state->render_area,
2252       .baseArrayLayer = 0,
2253       .layerCount = 1,
2254    };
2255    v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
2256 }
2257 
2258 static struct v3dv_job *
2259 cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
2260                               uint32_t subpass_idx,
2261                               enum v3dv_job_type type)
2262 {
2263    assert(type == V3DV_JOB_TYPE_GPU_CL ||
2264           type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
2265 
2266    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2267    assert(subpass_idx < state->pass->subpass_count);
2268 
2269    /* Starting a new job can trigger a finish of the current one, so don't
2270     * change the command buffer state for the new job until we are done creating
2271     * the new job.
2272     */
2273    struct v3dv_job *job =
2274       v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
2275    if (!job)
2276       return NULL;
2277 
2278    state->subpass_idx = subpass_idx;
2279 
2280    /* If we are starting a new job we need to setup binning. We only do this
2281     * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
2282     * jobs are not submitted to the GPU directly, and are instead meant to be
2283     * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
2284     */
2285    if (type == V3DV_JOB_TYPE_GPU_CL &&
2286        job->first_subpass == state->subpass_idx) {
2287       const struct v3dv_subpass *subpass =
2288          &state->pass->subpasses[state->subpass_idx];
2289 
2290       const struct v3dv_framebuffer *framebuffer = state->framebuffer;
2291 
2292       uint8_t internal_bpp;
2293       bool msaa;
2294       v3dv_framebuffer_compute_internal_bpp_msaa(framebuffer, subpass,
2295                                                  &internal_bpp, &msaa);
2296 
2297       v3dv_job_start_frame(job,
2298                            framebuffer->width,
2299                            framebuffer->height,
2300                            framebuffer->layers,
2301                            subpass->color_count,
2302                            internal_bpp,
2303                            msaa);
2304    }
2305 
2306    return job;
2307 }
2308 
2309 struct v3dv_job *
2310 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
2311                               uint32_t subpass_idx)
2312 {
2313    assert(cmd_buffer->state.pass);
2314    assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
2315 
2316    struct v3dv_job *job =
2317       cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
2318                                     V3DV_JOB_TYPE_GPU_CL);
2319    if (!job)
2320       return NULL;
2321 
2322    /* Check if our render area is aligned to tile boundaries. We have to do
2323     * this in each subpass because the subset of attachments used can change
2324     * and with that the tile size selected by the hardware can change too.
2325     */
2326    cmd_buffer_update_tile_alignment(cmd_buffer);
2327 
2328    /* If we can't use TLB clears then we need to emit draw clears for any
2329     * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
2330     * Depth/Stencil clears if we hit GFXH-1461.
2331     *
2332     * Secondary command buffers don't start subpasses (and may not even have
2333     * framebuffer state), so we only care about this in primaries. The only
2334     * exception could be a secondary running inside a subpass that needs to
2335     * record a meta operation (with its own render pass) that relies on
2336     * attachment load clears, but we don't have any instances of that right
2337     * now.
2338     */
2339    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2340       cmd_buffer_emit_subpass_clears(cmd_buffer);
2341 
2342    return job;
2343 }
2344 
2345 struct v3dv_job *
2346 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
2347                                uint32_t subpass_idx)
2348 {
2349    assert(cmd_buffer->state.pass);
2350    assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
2351 
2352    struct v3dv_job *job;
2353    if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2354       job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
2355                                           V3DV_JOB_TYPE_GPU_CL);
2356    } else {
2357       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
2358       job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
2359                                           V3DV_JOB_TYPE_GPU_CL_SECONDARY);
2360    }
2361 
2362    if (!job)
2363       return NULL;
2364 
2365    job->is_subpass_continue = true;
2366 
2367    return job;
2368 }
2369 
2370 void
2371 v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
2372 {
2373    /* We can end up here without a job if the last command recorded into the
2374     * subpass already finished the job (for example a pipeline barrier). In
2375     * that case we don't set the is_subpass_finish flag, but that is not
2376     * required for proper behavior.
2377     */
2378    struct v3dv_job *job = cmd_buffer->state.job;
2379    if (job)
2380       job->is_subpass_finish = true;
2381 }
2382 
2383 void
2384 v3dv_CmdEndRenderPass(VkCommandBuffer commandBuffer)
2385 {
2386    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2387 
2388    /* Finalize last subpass */
2389    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2390    assert(state->subpass_idx == state->pass->subpass_count - 1);
2391    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
2392    v3dv_cmd_buffer_finish_job(cmd_buffer);
2393 
2394    cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
2395 
2396    /* We are no longer inside a render pass */
2397    state->framebuffer = NULL;
2398    state->pass = NULL;
2399    state->subpass_idx = -1;
2400 }
2401 
2402 VkResult
2403 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
2404 {
2405    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2406 
2407    if (cmd_buffer->state.oom)
2408       return VK_ERROR_OUT_OF_HOST_MEMORY;
2409 
2410    /* Primaries should have ended any recording jobs by the time they hit
2411     * vkCmdEndRenderPass (if we are inside a render pass). Commands outside
2412     * a render pass instance (for both primaries and secondaries) spawn
2413     * complete jobs too. So the only case where we can get here without
2414     * finishing a recording job is when we are recording a secondary
2415     * inside a render pass.
2416     */
2417    if (cmd_buffer->state.job) {
2418       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2419              cmd_buffer->state.pass);
2420       v3dv_cmd_buffer_finish_job(cmd_buffer);
2421    }
2422 
2423    cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
2424 
2425    return VK_SUCCESS;
2426 }
2427 
2428 static void
2429 emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer);
2430 
2431 static void
2432 ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
2433                    uint32_t slot_size,
2434                    uint32_t used_count,
2435                    uint32_t *alloc_count,
2436                    void **ptr);
2437 
2438 static void
2439 cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
2440                                           struct v3dv_cmd_buffer *secondary)
2441 {
2442    struct v3dv_cmd_buffer_state *p_state = &primary->state;
2443    struct v3dv_cmd_buffer_state *s_state = &secondary->state;
2444 
2445    const uint32_t total_state_count =
2446       p_state->query.end.used_count + s_state->query.end.used_count;
2447    ensure_array_state(primary,
2448                       sizeof(struct v3dv_end_query_cpu_job_info),
2449                       total_state_count,
2450                       &p_state->query.end.alloc_count,
2451                       (void **) &p_state->query.end.states);
2452    v3dv_return_if_oom(primary, NULL);
2453 
2454    for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
2455       const struct v3dv_end_query_cpu_job_info *s_qstate =
2456          &secondary->state.query.end.states[i];
2457 
2458       struct v3dv_end_query_cpu_job_info *p_qstate =
2459          &p_state->query.end.states[p_state->query.end.used_count++];
2460 
2461       p_qstate->pool = s_qstate->pool;
2462       p_qstate->query = s_qstate->query;
2463    }
2464 }
2465 
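/* Copies the v3dv_bo entries in src into freshly allocated entries linked
 * into dst. The underlying buffers are not duplicated; we only need separate
 * list links so the cloned job's BO lists don't alias the originals.
 */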
2466 static void
2467 clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
2468               struct list_head *dst,
2469               struct list_head *src)
2470 {
2471    assert(cmd_buffer);
2472 
2473    list_inithead(dst);
2474    list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
2475       struct v3dv_bo *clone_bo =
2476          vk_alloc(&cmd_buffer->device->alloc, sizeof(struct v3dv_bo), 8,
2477                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2478       if (!clone_bo) {
2479          v3dv_flag_oom(cmd_buffer, NULL);
2480          return;
2481       }
2482 
2483       *clone_bo = *bo;
2484       list_addtail(&clone_bo->list_link, dst);
2485    }
2486 }
2487 
2488 /* Clones a job for inclusion in the given command buffer. Note that this
2489  * doesn't make a deep copy, so the cloned job doesn't own any resources.
2490  * Useful when we need to have a job in more than one list, which happens
2491  * for jobs recorded in secondary command buffers when we want to execute
2492  * them in primaries.
2493  */
2494 static struct v3dv_job *
2495 job_clone_in_cmd_buffer(struct v3dv_job *job,
2496                         struct v3dv_cmd_buffer *cmd_buffer)
2497 {
2498    struct v3dv_job *clone_job = vk_alloc(&job->device->alloc,
2499                                          sizeof(struct v3dv_job), 8,
2500                                          VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2501    if (!clone_job) {
2502       v3dv_flag_oom(cmd_buffer, NULL);
2503       return NULL;
2504    }
2505 
2506    /* Cloned jobs don't duplicate resources! */
2507    *clone_job = *job;
2508    clone_job->is_clone = true;
2509    clone_job->cmd_buffer = cmd_buffer;
2510    list_addtail(&clone_job->list_link, &cmd_buffer->jobs);
2511 
2512    /* We need to regen the BO lists so that they point to the BO list in the
2513     * cloned job. Otherwise functions like list_length() will loop forever.
2514     */
2515    if (job->type == V3DV_JOB_TYPE_GPU_CL) {
2516       clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
2517       clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
2518       clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
2519                     &job->indirect.bo_list);
2520    }
2521 
2522    return clone_job;
2523 }
2524 
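/* Finishes the current subpass job and resumes the subpass in a new job that
 * is marked as serialized (and optionally as needing BCL sync), so that any
 * pending barrier state is consumed before subsequent work.
 */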
2525 static struct v3dv_job *
2526 cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
2527                                      bool is_bcl_barrier)
2528 {
2529    assert(cmd_buffer->state.subpass_idx >= 0);
2530    v3dv_cmd_buffer_finish_job(cmd_buffer);
2531    struct v3dv_job *job =
2532       v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2533                                      cmd_buffer->state.subpass_idx);
2534    if (!job)
2535       return NULL;
2536 
2537    job->serialize = true;
2538    job->needs_bcl_sync = is_bcl_barrier;
2539    return job;
2540 }
2541 
2542 static void
2543 cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary,
2544                                uint32_t cmd_buffer_count,
2545                                const VkCommandBuffer *cmd_buffers)
2546 {
2547    assert(primary->state.job);
2548 
2549    /* Emit occlusion query state if needed so the draw calls inside our
2550     * secondaries update the counters.
2551     */
2552    bool has_occlusion_query =
2553       primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2554    if (has_occlusion_query)
2555       emit_occlusion_query(primary);
2556 
2557    /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
2558     * pipelines used by the secondaries do, we need to re-start the primary
2559     * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
2560     */
2561    bool pending_barrier = false;
2562    bool pending_bcl_barrier = false;
2563    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2564       V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2565 
2566       assert(secondary->usage_flags &
2567              VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
2568 
2569       list_for_each_entry(struct v3dv_job, secondary_job,
2570                           &secondary->jobs, list_link) {
2571          if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
2572             /* If the job is a CL, then we branch to it from the primary BCL.
2573              * In this case the secondary's BCL is finished with a
2574              * RETURN_FROM_SUB_LIST command to return back to the primary BCL
2575              * once we are done executing it.
2576              */
2577             assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
2578             assert(secondary_job->bcl.bo);
2579 
2580             /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
2581             STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
2582             assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
2583             assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
2584                    V3D42_RETURN_FROM_SUB_LIST_opcode);
2585 
2586             /* If this secondary has any barriers (or we had any pending barrier
2587              * to apply), then we can't just branch to it from the primary, we
2588              * need to split the primary to create a new job that can consume
2589              * the barriers first.
2590              *
2591              * FIXME: in this case, maybe just copy the secondary BCL without
2592              * the RETURN_FROM_SUB_LIST into the primary job to skip the
2593              * branch?
2594              */
2595             struct v3dv_job *primary_job = primary->state.job;
2596             if (!primary_job || secondary_job->serialize || pending_barrier) {
2597                const bool needs_bcl_barrier =
2598                   secondary_job->needs_bcl_sync || pending_bcl_barrier;
2599                primary_job =
2600                   cmd_buffer_subpass_split_for_barrier(primary,
2601                                                        needs_bcl_barrier);
2602                v3dv_return_if_oom(primary, NULL);
2603 
2604                /* Since we have created a new primary we need to re-emit
2605                 * occlusion query state.
2606                 */
2607                if (has_occlusion_query)
2608                   emit_occlusion_query(primary);
2609             }
2610 
2611             /* Make sure our primary job has all required BO references */
2612             set_foreach(secondary_job->bos, entry) {
2613                struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
2614                v3dv_job_add_bo(primary_job, bo);
2615             }
2616 
2617             /* Emit the branch instruction */
2618             v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
2619                                              cl_packet_length(BRANCH_TO_SUB_LIST));
2620             v3dv_return_if_oom(primary, NULL);
2621 
2622             cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
2623                branch.address = v3dv_cl_address(secondary_job->bcl.bo, 0);
2624             }
2625 
2626             primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
2627          } else if (secondary_job->type == V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS) {
2628             if (pending_barrier) {
2629                cmd_buffer_subpass_split_for_barrier(primary, pending_bcl_barrier);
2630                v3dv_return_if_oom(primary, NULL);
2631             }
2632 
2633             const struct v3dv_clear_attachments_cpu_job_info *info =
2634                &secondary_job->cpu.clear_attachments;
2635             v3dv_CmdClearAttachments(v3dv_cmd_buffer_to_handle(primary),
2636                                      info->attachment_count,
2637                                      info->attachments,
2638                                      info->rect_count,
2639                                      info->rects);
2640          } else {
2641             /* This is a regular job (CPU or GPU), so just finish the current
2642              * primary job (if any) and then add the secondary job to the
2643              * primary's job list right after it.
2644              */
2645             v3dv_cmd_buffer_finish_job(primary);
2646             job_clone_in_cmd_buffer(secondary_job, primary);
2647             if (pending_barrier) {
2648                secondary_job->serialize = true;
2649                if (pending_bcl_barrier)
2650                   secondary_job->needs_bcl_sync = true;
2651             }
2652          }
2653 
2654          pending_barrier = false;
2655          pending_bcl_barrier = false;
2656       }
2657 
2658       /* If the secondary has recorded any vkCmdEndQuery commands, we need to
2659        * copy this state to the primary so it is processed properly when the
2660        * current primary job is finished.
2661        */
2662       cmd_buffer_copy_secondary_end_query_state(primary, secondary);
2663 
2664       /* If this secondary had any pending barrier state we will need that
2665        * barrier state consumed with whatever comes next in the primary.
2666        */
2667       assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
2668       pending_barrier = secondary->state.has_barrier;
2669       pending_bcl_barrier = secondary->state.has_bcl_barrier;
2670    }
2671 
2672    if (pending_barrier) {
2673       primary->state.has_barrier = true;
2674       primary->state.has_bcl_barrier |= pending_bcl_barrier;
2675    }
2676 }
2677 
2678 static void
2679 cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
2680                                 uint32_t cmd_buffer_count,
2681                                 const VkCommandBuffer *cmd_buffers)
2682 {
2683    bool pending_barrier = false;
2684    bool pending_bcl_barrier = false;
2685    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2686       V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2687 
2688       assert(!(secondary->usage_flags &
2689                VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
2690 
2691       /* Secondary command buffers that execute outside a render pass create
2692        * complete jobs with an RCL and tile setup, so we simply want to merge
2693        * their job list into the primary's. However, because they may be
2694        * executed in multiple primaries at the same time and we only have a
2695        * single list_link in each job, we can't just add them to the primary's
2696        * job list, so we instead have to clone them first.
2697        *
2698        * Alternatively, we could create an "execute secondary" CPU job that,
2699        * when executed in a queue, would submit all the jobs in the referenced
2700        * secondary command buffer. However, this would raise some challenges
2701        * to make it work with the implementation of wait threads in the queue
2702        * which we use for event waits, for example.
2703        */
2704       list_for_each_entry(struct v3dv_job, secondary_job,
2705                           &secondary->jobs, list_link) {
2706          /* These can only happen inside a render pass */
2707          assert(secondary_job->type != V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS);
2708          assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
2709          struct v3dv_job *job = job_clone_in_cmd_buffer(secondary_job, primary);
2710          if (!job)
2711             return;
2712 
2713          if (pending_barrier) {
2714             job->serialize = true;
2715             if (pending_bcl_barrier)
2716                job->needs_bcl_sync = true;
2717             pending_barrier = false;
2718             pending_bcl_barrier = false;
2719          }
2720       }
2721 
2722       /* If this secondary had any pending barrier state we will need that
2723        * barrier state consumed with whatever comes after it (first job in
2724        * the next secondary or the primary, if this was the last secondary).
2725        */
2726       assert(secondary->state.has_barrier || !secondary->state.has_bcl_barrier);
2727       pending_barrier = secondary->state.has_barrier;
2728       pending_bcl_barrier = secondary->state.has_bcl_barrier;
2729    }
2730 
2731    if (pending_barrier) {
2732       primary->state.has_barrier = true;
2733       primary->state.has_bcl_barrier |= pending_bcl_barrier;
2734    }
2735 }
2736 
2737 void
2738 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2739                         uint32_t commandBufferCount,
2740                         const VkCommandBuffer *pCommandBuffers)
2741 {
2742    V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
2743 
2744    if (primary->state.pass != NULL) {
2745       cmd_buffer_execute_inside_pass(primary,
2746                                      commandBufferCount, pCommandBuffers);
2747    } else {
2748       cmd_buffer_execute_outside_pass(primary,
2749                                       commandBufferCount, pCommandBuffers);
2750    }
2751 }
2752 
2753 /* This goes through the list of possible dynamic states in the pipeline and,
2754  * for those that are not configured as dynamic, copies relevant state into
2755  * the command buffer.
2756  */
2757 static void
2758 cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
2759                                       const struct v3dv_dynamic_state *src)
2760 {
2761    struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic;
2762    uint32_t dynamic_mask = src->mask;
2763    uint32_t dirty = 0;
2764 
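   /* Informal example: a pipeline created with VK_DYNAMIC_STATE_SCISSOR in
    * pDynamicStates will have V3DV_DYNAMIC_SCISSOR set in src->mask, so the
    * scissor block below is skipped and the values set via vkCmdSetScissor
    * are preserved, while every non-dynamic state is copied from the
    * pipeline and flagged dirty if it changed.
    */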
2765    if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) {
2766       dest->viewport.count = src->viewport.count;
2767       if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
2768                  src->viewport.count * sizeof(VkViewport))) {
2769          typed_memcpy(dest->viewport.viewports,
2770                       src->viewport.viewports,
2771                       src->viewport.count);
2772          typed_memcpy(dest->viewport.scale, src->viewport.scale,
2773                       src->viewport.count);
2774          typed_memcpy(dest->viewport.translate, src->viewport.translate,
2775                       src->viewport.count);
2776          dirty |= V3DV_CMD_DIRTY_VIEWPORT;
2777       }
2778    }
2779 
2780    if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) {
2781       dest->scissor.count = src->scissor.count;
2782       if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
2783                  src->scissor.count * sizeof(VkRect2D))) {
2784          typed_memcpy(dest->scissor.scissors,
2785                       src->scissor.scissors, src->scissor.count);
2786          dirty |= V3DV_CMD_DIRTY_SCISSOR;
2787       }
2788    }
2789 
2790    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
2791       if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
2792                  sizeof(src->stencil_compare_mask))) {
2793          dest->stencil_compare_mask = src->stencil_compare_mask;
2794          dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
2795       }
2796    }
2797 
2798    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
2799       if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
2800                  sizeof(src->stencil_write_mask))) {
2801          dest->stencil_write_mask = src->stencil_write_mask;
2802          dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
2803       }
2804    }
2805 
2806    if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
2807       if (memcmp(&dest->stencil_reference, &src->stencil_reference,
2808                  sizeof(src->stencil_reference))) {
2809          dest->stencil_reference = src->stencil_reference;
2810          dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2811       }
2812    }
2813 
2814    if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
2815       if (memcmp(dest->blend_constants, src->blend_constants,
2816                  sizeof(src->blend_constants))) {
2817          memcpy(dest->blend_constants, src->blend_constants,
2818                 sizeof(src->blend_constants));
2819          dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
2820       }
2821    }
2822 
2823    if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) {
2824       if (memcmp(&dest->depth_bias, &src->depth_bias,
2825                  sizeof(src->depth_bias))) {
2826          memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
2827          dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
2828       }
2829    }
2830 
2831    if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
2832       if (dest->line_width != src->line_width) {
2833          dest->line_width = src->line_width;
2834          dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
2835       }
2836    }
2837 
2838    cmd_buffer->state.dynamic.mask = dynamic_mask;
2839    cmd_buffer->state.dirty |= dirty;
2840 }
2841 
2842 static void
2843 job_update_ez_state(struct v3dv_job *job,
2844                     struct v3dv_pipeline *pipeline,
2845                     struct v3dv_cmd_buffer_state *state)
2846 {
2847    /* If we don't have a depth attachment at all, disable */
2848    if (!state->pass) {
2849       job->ez_state = VC5_EZ_DISABLED;
2850       return;
2851    }
2852 
2853    assert(state->subpass_idx < state->pass->subpass_count);
2854    struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
2855    if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
2856       job->ez_state = VC5_EZ_DISABLED;
2857       return;
2858    }
2859 
2860    /* Otherwise, look at the currently bound pipeline state */
2861    switch (pipeline->ez_state) {
2862    case VC5_EZ_UNDECIDED:
2863       /* If the pipeline didn't pick a direction but didn't disable, then go
2864        * along with the current EZ state. This allows EZ optimization for Z
2865        * func == EQUAL or NEVER.
2866        */
2867       break;
2868 
2869    case VC5_EZ_LT_LE:
2870    case VC5_EZ_GT_GE:
2871       /* If the pipeline picked a direction, then it needs to match the current
2872        * direction if we've decided on one.
2873        */
2874       if (job->ez_state == VC5_EZ_UNDECIDED)
2875          job->ez_state = pipeline->ez_state;
2876       else if (job->ez_state != pipeline->ez_state)
2877          job->ez_state = VC5_EZ_DISABLED;
2878       break;
2879 
2880    case VC5_EZ_DISABLED:
2881       /* If the pipeline disables EZ because of a bad Z func or stencil
2882        * operation, then we can't do any more EZ in this frame.
2883        */
2884       job->ez_state = VC5_EZ_DISABLED;
2885       break;
2886    }
2887 
2888    /* If the FS writes Z, then it may update against the chosen EZ direction */
2889    if (pipeline->fs->current_variant->prog_data.fs->writes_z)
2890       job->ez_state = VC5_EZ_DISABLED;
2891 
2892    if (job->first_ez_state == VC5_EZ_UNDECIDED &&
2893        job->ez_state != VC5_EZ_DISABLED) {
2894       job->first_ez_state = job->ez_state;
2895    }
2896 }
2897 
2898 /* Note that the following populate methods don't do a detailed fill-up of
2899  * the v3d_key. Here we just fill in cmd_buffer specific info. All the info
2900  * coming from the pipeline create info was already filled in when the
2901  * pipeline was created.
2902  */
2903 static void
2904 cmd_buffer_populate_v3d_key(struct v3d_key *key,
2905                             struct v3dv_cmd_buffer *cmd_buffer,
2906                             VkPipelineBindPoint pipeline_binding)
2907 {
2908    if (cmd_buffer->state.pipeline->combined_index_map != NULL) {
2909       struct v3dv_descriptor_map *texture_map = &cmd_buffer->state.pipeline->texture_map;
2910       struct v3dv_descriptor_map *sampler_map = &cmd_buffer->state.pipeline->sampler_map;
2911       struct v3dv_descriptor_state *descriptor_state =
2912          &cmd_buffer->state.descriptor_state[pipeline_binding];
2913 
2914       /* At pipeline creation time we pre-generated an all-16-bit and an
2915        * all-32-bit return size variant, so let's do the same here to avoid
2916        * triggering a new compilation as much as possible.
2917        */
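      /* Informal note: the loop below only bumps the return size to 32 if any
       * combined texture+sampler in use requires 32-bit returns; otherwise we
       * stick to the pre-generated 16-bit variant and avoid a recompile.
       */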
2918       uint32_t v3d_key_return_size = 16;
2919       hash_table_foreach(cmd_buffer->state.pipeline->combined_index_map, entry) {
2920          uint32_t combined_idx = (uint32_t)(uintptr_t) (entry->data);
2921          uint32_t combined_idx_key =
2922             cmd_buffer->state.pipeline->combined_index_to_key_map[combined_idx];
2923          uint32_t texture_idx;
2924          uint32_t sampler_idx;
2925 
2926          v3dv_pipeline_combined_index_key_unpack(combined_idx_key,
2927                                                  &texture_idx, &sampler_idx);
2928 
2929          VkFormat vk_format;
2930          const struct v3dv_format *format;
2931 
2932          format =
2933             v3dv_descriptor_map_get_texture_format(descriptor_state,
2934                                                    texture_map,
2935                                                    cmd_buffer->state.pipeline->layout,
2936                                                    texture_idx,
2937                                                    &vk_format);
2938 
2939          const struct v3dv_sampler *sampler = NULL;
2940          if (sampler_idx != V3DV_NO_SAMPLER_IDX) {
2941             sampler =
2942                v3dv_descriptor_map_get_sampler(descriptor_state,
2943                                                sampler_map,
2944                                                cmd_buffer->state.pipeline->layout,
2945                                                sampler_idx);
2946             assert(sampler);
2947          }
2948 
2949          key->tex[combined_idx].return_size =
2950             v3dv_get_tex_return_size(format,
2951                                      sampler ? sampler->compare_enable : false);
2952 
2953          if (key->tex[combined_idx].return_size == 32) {
2954             v3d_key_return_size = 32;
2955          }
2956       }
2957       v3d_key_update_return_size(cmd_buffer->state.pipeline, key,
2958                                  v3d_key_return_size);
2959    }
2960 }
2961 
2962 static void
2963 update_fs_variant(struct v3dv_cmd_buffer *cmd_buffer)
2964 {
2965    struct v3dv_shader_variant *variant;
2966    struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->fs;
2967    struct v3d_fs_key local_key;
2968 
2969    /* We start with a copy of the original pipeline key */
2970    memcpy(&local_key, &p_stage->key.fs, sizeof(struct v3d_fs_key));
2971 
2972    cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
2973                                VK_PIPELINE_BIND_POINT_GRAPHICS);
2974 
2975    VkResult vk_result;
2976    variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
2977                                      sizeof(struct v3d_fs_key),
2978                                      &cmd_buffer->device->alloc,
2979                                      &vk_result);
2980    /* At this point we are not creating a vulkan object to return to the
2981     * API user, so we can't really return an OOM error.
2982     */
2983    assert(variant);
2984    assert(vk_result == VK_SUCCESS);
2985 
2986    if (p_stage->current_variant != variant) {
2987       v3dv_shader_variant_unref(cmd_buffer->device, p_stage->current_variant);
2988    }
2989    p_stage->current_variant = variant;
2990 }
2991 
2992 static void
2993 update_vs_variant(struct v3dv_cmd_buffer *cmd_buffer)
2994 {
2995    struct v3dv_shader_variant *variant;
2996    struct v3dv_pipeline_stage *p_stage;
2997    struct v3d_vs_key local_key;
2998    VkResult vk_result;
2999 
3000    /* We start with a copy of the original pipeline key */
3001    p_stage = cmd_buffer->state.pipeline->vs;
3002    memcpy(&local_key, &p_stage->key.vs, sizeof(struct v3d_vs_key));
3003 
3004    cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
3005                                VK_PIPELINE_BIND_POINT_GRAPHICS);
3006 
3007    variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
3008                                      sizeof(struct v3d_vs_key),
3009                                      &cmd_buffer->device->alloc,
3010                                      &vk_result);
3011    /* At this point we are not creating a vulkan object to return to the
3012     * API user, so we can't really return an OOM error.
3013     */
3014    assert(variant);
3015    assert(vk_result == VK_SUCCESS);
3016 
3017    if (p_stage->current_variant != variant) {
3018       v3dv_shader_variant_unref(cmd_buffer->device, p_stage->current_variant);
3019    }
3020    p_stage->current_variant = variant;
3021 
3022    /* Now the vs_bin */
3023    p_stage = cmd_buffer->state.pipeline->vs_bin;
3024    memcpy(&local_key, &p_stage->key.vs, sizeof(struct v3d_vs_key));
3025 
3026    cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
3027                                VK_PIPELINE_BIND_POINT_GRAPHICS);
3028    variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
3029                                      sizeof(struct v3d_vs_key),
3030                                      &cmd_buffer->device->alloc,
3031                                      &vk_result);
3032 
3033    /* At this point we are not creating a vulkan object to return to the
3034     * API user, so we can't really return an OOM error.
3035     */
3036    assert(variant);
3037    assert(vk_result == VK_SUCCESS);
3038 
3039    if (p_stage->current_variant != variant) {
3040       v3dv_shader_variant_unref(cmd_buffer->device, p_stage->current_variant);
3041    }
3042    p_stage->current_variant = variant;
3043 }
3044 
3045 static void
3046 update_cs_variant(struct v3dv_cmd_buffer *cmd_buffer)
3047 {
3048    struct v3dv_shader_variant *variant;
3049    struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->cs;
3050    struct v3d_key local_key;
3051 
3052    /* We start with a copy of the original pipeline key */
3053    memcpy(&local_key, &p_stage->key.base, sizeof(struct v3d_key));
3054 
3055    cmd_buffer_populate_v3d_key(&local_key, cmd_buffer,
3056                                VK_PIPELINE_BIND_POINT_COMPUTE);
3057 
3058    VkResult result;
3059    variant = v3dv_get_shader_variant(p_stage, NULL, &local_key,
3060                                      sizeof(struct v3d_key),
3061                                      &cmd_buffer->device->alloc,
3062                                      &result);
3063    /* At this point we are not creating a vulkan object to return to the
3064     * API user, so we can't really return an OOM error.
3065     */
3066    assert(variant);
3067    assert(result == VK_SUCCESS);
3068 
3069    if (p_stage->current_variant != variant) {
3070       v3dv_shader_variant_unref(cmd_buffer->device, p_stage->current_variant);
3071    }
3072    p_stage->current_variant = variant;
3073 }
3074 
3075 /*
3076  * Some updates on the cmd buffer also require updates on the shaders being
3077  * compiled for the pipeline. The poster boy here is textures, as the compiler
3078  * needs to do certain things depending on the texture format. So here we
3079  * re-create the v3d_keys and update the variants. Note that internally the
3080  * pipeline has a variant cache (hash table) to avoid unneeded compilations.
3082  */
3083 static void
3084 update_pipeline_variants(struct v3dv_cmd_buffer *cmd_buffer)
3085 {
3086    assert(cmd_buffer->state.pipeline);
3087 
3088    if (v3dv_pipeline_get_binding_point(cmd_buffer->state.pipeline) ==
3089        VK_PIPELINE_BIND_POINT_GRAPHICS) {
3090       update_fs_variant(cmd_buffer);
3091       update_vs_variant(cmd_buffer);
3092    } else {
3093       update_cs_variant(cmd_buffer);
3094    }
3095 }
3096 
3097 static void
3098 bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
3099                        struct v3dv_pipeline *pipeline)
3100 {
3101    assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
3102    if (cmd_buffer->state.pipeline == pipeline)
3103       return;
3104 
3105    /* Enable always flush if we are blending to sRGB render targets. This
3106     * fixes test failures in:
3107     * dEQP-VK.pipeline.blend.format.r8g8b8a8_srgb.*
3108     *
3109     * FIXME: not sure why we need this. The tile buffer is always linear, with
3110     * conversion from/to sRGB happening on tile load/store operations. This
3111     * means that when we enable flushing the only difference is that we convert
3112     * to sRGB on the store after each draw call and we convert from sRGB on the
3113     * load before each draw call, but the blend happens in linear format in the
3114     * tile buffer anyway, which is the same scenario as if we didn't flush.
3115     */
3116    assert(pipeline->subpass);
3117    if (pipeline->subpass->has_srgb_rt && pipeline->blend.enables) {
3118       assert(cmd_buffer->state.job);
3119       cmd_buffer->state.job->always_flush = true;
3120       perf_debug("flushing draw calls for subpass %d because bound pipeline "
3121                  "uses sRGB blending\n", cmd_buffer->state.subpass_idx);
3122    }
3123 
3124    cmd_buffer->state.pipeline = pipeline;
3125 
3126    cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
3127 
3128    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
3129 }
3130 
3131 static void
3132 bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
3133                       struct v3dv_pipeline *pipeline)
3134 {
3135    assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
3136 
3137    if (cmd_buffer->state.pipeline == pipeline)
3138       return;
3139 
3140    cmd_buffer->state.pipeline = pipeline;
3141    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
3142 }
3143 
3144 void
3145 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
3146                      VkPipelineBindPoint pipelineBindPoint,
3147                      VkPipeline _pipeline)
3148 {
3149    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3150    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
3151 
3152    switch (pipelineBindPoint) {
3153    case VK_PIPELINE_BIND_POINT_COMPUTE:
3154       bind_compute_pipeline(cmd_buffer, pipeline);
3155       break;
3156 
3157    case VK_PIPELINE_BIND_POINT_GRAPHICS:
3158       bind_graphics_pipeline(cmd_buffer, pipeline);
3159       break;
3160 
3161    default:
3162       assert(!"invalid bind point");
3163       break;
3164    }
3165 }
3166 
3167 /* FIXME: C&P from radv. tu has similar code. Perhaps move it to a common place? */
3168 void
3169 v3dv_viewport_compute_xform(const VkViewport *viewport,
3170                             float scale[3],
3171                             float translate[3])
3172 {
3173    float x = viewport->x;
3174    float y = viewport->y;
3175    float half_width = 0.5f * viewport->width;
3176    float half_height = 0.5f * viewport->height;
3177    double n = viewport->minDepth;
3178    double f = viewport->maxDepth;
3179 
3180    scale[0] = half_width;
3181    translate[0] = half_width + x;
3182    scale[1] = half_height;
3183    translate[1] = half_height + y;
3184 
3185    scale[2] = (f - n);
3186    translate[2] = n;
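   /* Worked example (informal): a VkViewport with x = 0, y = 0, width = 800,
    * height = 600, minDepth = 0.0 and maxDepth = 1.0 yields
    * scale = {400, 300, 1} and translate = {400, 300, 0}, mapping NDC X in
    * [-1, 1] to [0, 800], NDC Y in [-1, 1] to [0, 600] and leaving Z in
    * [0, 1].
    */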
3187 
3188    /* It seems that if the scale is small enough the hardware won't clip
3189     * correctly, so we work around this by choosing the smallest scale that
3190     * seems to work.
3191     *
3192     * This case is exercised by CTS:
3193     * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
3194     */
3195    const float min_abs_scale = 0.000009f;
3196    if (fabs(scale[2]) < min_abs_scale)
3197       scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
3198 }
3199 
3200 void
3201 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
3202                     uint32_t firstViewport,
3203                     uint32_t viewportCount,
3204                     const VkViewport *pViewports)
3205 {
3206    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3207    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3208    const uint32_t total_count = firstViewport + viewportCount;
3209 
3210    assert(firstViewport < MAX_VIEWPORTS);
3211    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
3212 
3213    if (state->dynamic.viewport.count < total_count)
3214       state->dynamic.viewport.count = total_count;
3215 
3216    if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
3217                pViewports, viewportCount * sizeof(*pViewports))) {
3218       return;
3219    }
3220 
3221    memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
3222           viewportCount * sizeof(*pViewports));
3223 
3224    for (uint32_t i = firstViewport; i < total_count; i++) {
3225       v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
3226                                   state->dynamic.viewport.scale[i],
3227                                   state->dynamic.viewport.translate[i]);
3228    }
3229 
3230    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
3231 }
3232 
3233 void
3234 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
3235                    uint32_t firstScissor,
3236                    uint32_t scissorCount,
3237                    const VkRect2D *pScissors)
3238 {
3239    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3240    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3241 
3242    assert(firstScissor < MAX_SCISSORS);
3243    assert(firstScissor + scissorCount >= 1 &&
3244           firstScissor + scissorCount <= MAX_SCISSORS);
3245 
3246    if (state->dynamic.scissor.count < firstScissor + scissorCount)
3247       state->dynamic.scissor.count = firstScissor + scissorCount;
3248 
3249    if (!memcmp(state->dynamic.scissor.scissors + firstScissor,
3250                pScissors, scissorCount * sizeof(*pScissors))) {
3251       return;
3252    }
3253 
3254    memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
3255           scissorCount * sizeof(*pScissors));
3256 
3257    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;
3258 }
3259 
3260 static void
3261 emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
3262 {
3263    if (cmd_buffer->state.dynamic.viewport.count == 0)
3264       return;
3265 
3266    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
3267 
3268    /* FIXME: right now we only support one viewport. viewports[0] would work
3269     * now, but would need to change if we allow multiple viewports.
3270     */
3271    float *vptranslate = dynamic->viewport.translate[0];
3272    float *vpscale = dynamic->viewport.scale[0];
3273 
3274    float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
3275    float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
3276    float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
3277    float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
3278 
3279    /* Quoting from v3dx_emit:
3280     * "Clip to the scissor if it's enabled, but still clip to the
3281     * drawable regardless since that controls where the binner
3282     * tries to put things.
3283     *
3284     * Additionally, always clip the rendering to the viewport,
3285     * since the hardware does guardband clipping, meaning
3286     * primitives would rasterize outside of the view volume."
3287     */
3288    uint32_t minx, miny, maxx, maxy;
3289 
3290    /* From the Vulkan spec:
3291     *
3292     * "The application must ensure (using scissor if necessary) that all
3293     *  rendering is contained within the render area. The render area must be
3294     *  contained within the framebuffer dimensions."
3295     *
3296     * So it is the application's responsibility to ensure this. Still, we can
3297     * help by automatically restricting the scissor rect to the render area.
3298     */
3299    minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
3300    miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
3301    maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
3302                         cmd_buffer->state.render_area.extent.width);
3303    maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
3304                         cmd_buffer->state.render_area.extent.height);
3305 
3311    /* Clip against user provided scissor if needed.
3312     *
3313     * FIXME: right now we only allow one scissor. Below would need to be
3314     * updated if we support more
3315     */
3316    if (dynamic->scissor.count > 0) {
3317       VkRect2D *scissor = &dynamic->scissor.scissors[0];
3318       minx = MAX2(minx, scissor->offset.x);
3319       miny = MAX2(miny, scissor->offset.y);
3320       maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
3321       maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);
3322    }
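   /* Informal example: with an 800x600 viewport (and a render area covering
    * the whole framebuffer), a scissor of offset (100, 100) and extent
    * 200x200 results in a clip window of offset (100, 100) and extent
    * 200x200; without a scissor the clip window is just the viewport bounds.
    */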
3323 
3324    /* If the scissor is outside the viewport area we end up with
3325     * min{x,y} > max{x,y}.
3326     */
3327    if (minx > maxx)
3328       maxx = minx;
3329    if (miny > maxy)
3330       maxy = miny;
3331 
3332    cmd_buffer->state.clip_window.offset.x = minx;
3333    cmd_buffer->state.clip_window.offset.y = miny;
3334    cmd_buffer->state.clip_window.extent.width = maxx - minx;
3335    cmd_buffer->state.clip_window.extent.height = maxy - miny;
3336 
3337    emit_clip_window(cmd_buffer->state.job, &cmd_buffer->state.clip_window);
3338 
3339    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;
3340 }
3341 
3342 static void
3343 emit_viewport(struct v3dv_cmd_buffer *cmd_buffer)
3344 {
3345    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
3346    /* FIXME: right now we only support one viewport. viewports[0] would work
3347     * now, but would need to change if we allow multiple viewports.
3348     */
3349    float *vptranslate = dynamic->viewport.translate[0];
3350    float *vpscale = dynamic->viewport.scale[0];
3351 
3352    struct v3dv_job *job = cmd_buffer->state.job;
3353    assert(job);
3354 
3355    const uint32_t required_cl_size =
3356       cl_packet_length(CLIPPER_XY_SCALING) +
3357       cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
3358       cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
3359       cl_packet_length(VIEWPORT_OFFSET);
3360    v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
3361    v3dv_return_if_oom(cmd_buffer, NULL);
3362 
3363    cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
3364       clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
3365       clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
3366    }
3367 
3368    cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
3369       clip.viewport_z_offset_zc_to_zs = vptranslate[2];
3370       clip.viewport_z_scale_zc_to_zs = vpscale[2];
3371    }
3372    cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
3373       /* Vulkan's Z NDC is [0..1], unlike OpenGL, which is [-1, 1] */
3374       float z1 = vptranslate[2];
3375       float z2 = vptranslate[2] + vpscale[2];
3376       clip.minimum_zw = MIN2(z1, z2);
3377       clip.maximum_zw = MAX2(z1, z2);
3378    }
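   /* Note: with an inverted depth range (maxDepth < minDepth) vpscale[2] is
    * negative, so z1/z2 above are ordered with MIN2/MAX2 to keep
    * minimum_zw <= maximum_zw.
    */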
3379 
3380    cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
3381       vp.viewport_centre_x_coordinate = vptranslate[0];
3382       vp.viewport_centre_y_coordinate = vptranslate[1];
3383    }
3384 
3385    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
3386 }
3387 
3388 static void
3389 emit_stencil(struct v3dv_cmd_buffer *cmd_buffer)
3390 {
3391    struct v3dv_job *job = cmd_buffer->state.job;
3392    assert(job);
3393 
3394    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3395    struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;
3396 
3397    const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
3398                                            V3DV_DYNAMIC_STENCIL_WRITE_MASK |
3399                                            V3DV_DYNAMIC_STENCIL_REFERENCE;
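   /* If any of the stencil states above is dynamic we patch the pre-packed
    * STENCIL_CFG from the pipeline with the values set on the command buffer;
    * otherwise we can emit the pipeline's pre-packed config directly.
    */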
3400 
3401    v3dv_cl_ensure_space_with_branch(&job->bcl,
3402                                     2 * cl_packet_length(STENCIL_CFG));
3403    v3dv_return_if_oom(cmd_buffer, NULL);
3404 
3405    bool emitted_stencil = false;
3406    for (uint32_t i = 0; i < 2; i++) {
3407       if (pipeline->emit_stencil_cfg[i]) {
3408          if (dynamic_state->mask & dynamic_stencil_states) {
3409             cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
3410                                    pipeline->stencil_cfg[i], config) {
3411                if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
3412                   config.stencil_test_mask =
3413                      i == 0 ? dynamic_state->stencil_compare_mask.front :
3414                               dynamic_state->stencil_compare_mask.back;
3415                }
3416                if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
3417                   config.stencil_write_mask =
3418                      i == 0 ? dynamic_state->stencil_write_mask.front :
3419                               dynamic_state->stencil_write_mask.back;
3420                }
3421                if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
3422                   config.stencil_ref_value =
3423                      i == 0 ? dynamic_state->stencil_reference.front :
3424                               dynamic_state->stencil_reference.back;
3425                }
3426             }
3427          } else {
3428             cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
3429          }
3430 
3431          emitted_stencil = true;
3432       }
3433    }
3434 
3435    if (emitted_stencil) {
3436       const uint32_t dynamic_stencil_dirty_flags =
3437                V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
3438                V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
3439                V3DV_CMD_DIRTY_STENCIL_REFERENCE;
3440       cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags;
3441    }
3442 }
3443 
3444 static void
3445 emit_depth_bias(struct v3dv_cmd_buffer *cmd_buffer)
3446 {
3447    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3448    assert(pipeline);
3449 
3450    if (!pipeline->depth_bias.enabled)
3451       return;
3452 
3453    struct v3dv_job *job = cmd_buffer->state.job;
3454    assert(job);
3455 
3456    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
3457    v3dv_return_if_oom(cmd_buffer, NULL);
3458 
3459    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
3460    cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
3461       bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
3462       bias.depth_offset_units = dynamic->depth_bias.constant_factor;
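      /* Informal note: the 256x factor below presumably compensates for the
       * hardware expressing depth offset units relative to a 24-bit depth
       * buffer, so 16-bit depth formats need an extra 2^24 / 2^16 = 256
       * scale on the constant factor.
       */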
3463       if (pipeline->depth_bias.is_z16)
3464          bias.depth_offset_units *= 256.0f;
3465    }
3466 
3467    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
3468 }
3469 
3470 static void
3471 emit_line_width(struct v3dv_cmd_buffer *cmd_buffer)
3472 {
3473    struct v3dv_job *job = cmd_buffer->state.job;
3474    assert(job);
3475 
3476    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
3477    v3dv_return_if_oom(cmd_buffer, NULL);
3478 
3479    cl_emit(&job->bcl, LINE_WIDTH, line) {
3480       line.line_width = cmd_buffer->state.dynamic.line_width;
3481    }
3482 
3483    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH;
3484 }
3485 
3486 static void
3487 emit_sample_state(struct v3dv_cmd_buffer *cmd_buffer)
3488 {
3489    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3490    assert(pipeline);
3491 
3492    struct v3dv_job *job = cmd_buffer->state.job;
3493    assert(job);
3494 
3495    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE));
3496    v3dv_return_if_oom(cmd_buffer, NULL);
3497 
3498    cl_emit(&job->bcl, SAMPLE_STATE, state) {
3499       state.coverage = 1.0f;
3500       state.mask = pipeline->sample_mask;
3501    }
3502 }
3503 
3504 static void
3505 emit_blend(struct v3dv_cmd_buffer *cmd_buffer)
3506 {
3507    struct v3dv_job *job = cmd_buffer->state.job;
3508    assert(job);
3509 
3510    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3511    assert(pipeline);
3512 
3513    const uint32_t blend_packets_size =
3514       cl_packet_length(BLEND_ENABLES) +
3515       cl_packet_length(BLEND_CONSTANT_COLOR) +
3516       cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS +
3517       cl_packet_length(COLOR_WRITE_MASKS);
3518 
3519    v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
3520    v3dv_return_if_oom(cmd_buffer, NULL);
3521 
3522    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
3523       if (pipeline->blend.enables) {
3524          cl_emit(&job->bcl, BLEND_ENABLES, enables) {
3525             enables.mask = pipeline->blend.enables;
3526          }
3527       }
3528 
3529       for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
3530          if (pipeline->blend.enables & (1 << i))
3531             cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
3532       }
3533 
3534       cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
3535          mask.mask = pipeline->blend.color_write_masks;
3536       }
3537    }
3538 
3539    if (pipeline->blend.needs_color_constants &&
3540        cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) {
3541       struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
3542       cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
3543          color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]);
3544          color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]);
3545          color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]);
3546          color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]);
3547       }
3548       cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS;
3549    }
3550 }
3551 
3552 static void
3553 emit_flat_shade_flags(struct v3dv_job *job,
3554                       int varying_offset,
3555                       uint32_t varyings,
3556                       enum V3DX(Varying_Flags_Action) lower,
3557                       enum V3DX(Varying_Flags_Action) higher)
3558 {
3559    v3dv_cl_ensure_space_with_branch(&job->bcl,
3560                                     cl_packet_length(FLAT_SHADE_FLAGS));
3561    v3dv_return_if_oom(NULL, job);
3562 
3563    cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
3564       flags.varying_offset_v0 = varying_offset;
3565       flags.flat_shade_flags_for_varyings_v024 = varyings;
3566       flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
3567       flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
3568    }
3569 }
3570 
3571 static void
3572 emit_noperspective_flags(struct v3dv_job *job,
3573                          int varying_offset,
3574                          uint32_t varyings,
3575                          enum V3DX(Varying_Flags_Action) lower,
3576                          enum V3DX(Varying_Flags_Action) higher)
3577 {
3578    v3dv_cl_ensure_space_with_branch(&job->bcl,
3579                                     cl_packet_length(NON_PERSPECTIVE_FLAGS));
3580    v3dv_return_if_oom(NULL, job);
3581 
3582    cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
3583       flags.varying_offset_v0 = varying_offset;
3584       flags.non_perspective_flags_for_varyings_v024 = varyings;
3585       flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
3586       flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
3587    }
3588 }
3589 
3590 static void
3591 emit_centroid_flags(struct v3dv_job *job,
3592                     int varying_offset,
3593                     uint32_t varyings,
3594                     enum V3DX(Varying_Flags_Action) lower,
3595                     enum V3DX(Varying_Flags_Action) higher)
3596 {
3597    v3dv_cl_ensure_space_with_branch(&job->bcl,
3598                                     cl_packet_length(CENTROID_FLAGS));
3599    v3dv_return_if_oom(NULL, job);
3600 
3601    cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
3602       flags.varying_offset_v0 = varying_offset;
3603       flags.centroid_flags_for_varyings_v024 = varyings;
3604       flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
3605       flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
3606    }
3607 }
3608 
3609 static bool
3610 emit_varying_flags(struct v3dv_job *job,
3611                    uint32_t num_flags,
3612                    const uint32_t *flags,
3613                    void (*flag_emit_callback)(struct v3dv_job *job,
3614                                               int varying_offset,
3615                                               uint32_t flags,
3616                                               enum V3DX(Varying_Flags_Action) lower,
3617                                               enum V3DX(Varying_Flags_Action) higher))
3618 {
3619    bool emitted_any = false;
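   /* Each packet emitted by the callback covers one group of varyings
    * (selected by the varying offset) and also tells the hardware what to do
    * with the flags of lower/higher numbered groups: the first packet zeroes
    * everything outside its own group, while subsequent packets leave the
    * previously emitted flags unchanged.
    */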
3620    for (int i = 0; i < num_flags; i++) {
3621       if (!flags[i])
3622          continue;
3623 
3624       if (emitted_any) {
3625         flag_emit_callback(job, i, flags[i],
3626                            V3D_VARYING_FLAGS_ACTION_UNCHANGED,
3627                            V3D_VARYING_FLAGS_ACTION_UNCHANGED);
3628       } else if (i == 0) {
3629         flag_emit_callback(job, i, flags[i],
3630                            V3D_VARYING_FLAGS_ACTION_UNCHANGED,
3631                            V3D_VARYING_FLAGS_ACTION_ZEROED);
3632       } else {
3633         flag_emit_callback(job, i, flags[i],
3634                            V3D_VARYING_FLAGS_ACTION_ZEROED,
3635                            V3D_VARYING_FLAGS_ACTION_ZEROED);
3636       }
3637 
3638       emitted_any = true;
3639    }
3640 
3641    return emitted_any;
3642 }
3643 
3644 static void
3645 emit_varyings_state(struct v3dv_cmd_buffer *cmd_buffer)
3646 {
3647    struct v3dv_job *job = cmd_buffer->state.job;
3648    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3649 
3650    struct v3d_fs_prog_data *prog_data_fs =
3651       pipeline->fs->current_variant->prog_data.fs;
3652 
3653    const uint32_t num_flags =
3654       ARRAY_SIZE(prog_data_fs->flat_shade_flags);
3655    const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
3656    const uint32_t *noperspective_flags =  prog_data_fs->noperspective_flags;
3657    const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
3658 
3659    if (!emit_varying_flags(job, num_flags, flat_shade_flags,
3660                            emit_flat_shade_flags)) {
3661       v3dv_cl_ensure_space_with_branch(
3662          &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
3663       v3dv_return_if_oom(cmd_buffer, NULL);
3664 
3665       cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
3666    }
3667 
3668    if (!emit_varying_flags(job, num_flags, noperspective_flags,
3669                            emit_noperspective_flags)) {
3670       v3dv_cl_ensure_space_with_branch(
3671          &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
3672       v3dv_return_if_oom(cmd_buffer, NULL);
3673 
3674       cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
3675    }
3676 
3677    if (!emit_varying_flags(job, num_flags, centroid_flags,
3678                            emit_centroid_flags)) {
3679       v3dv_cl_ensure_space_with_branch(
3680          &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
3681       v3dv_return_if_oom(cmd_buffer, NULL);
3682 
3683       cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
3684    }
3685 }
3686 
3687 static void
3688 emit_configuration_bits(struct v3dv_cmd_buffer *cmd_buffer)
3689 {
3690    struct v3dv_job *job = cmd_buffer->state.job;
3691    assert(job);
3692 
3693    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
3694    assert(pipeline);
3695 
3696    job_update_ez_state(job, pipeline, &cmd_buffer->state);
3697 
3698    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
3699    v3dv_return_if_oom(cmd_buffer, NULL);
3700 
3701    cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
3702       config.early_z_updates_enable = job->ez_state != VC5_EZ_DISABLED;
3703       config.early_z_enable = config.early_z_updates_enable;
3704    }
3705 }
3706 
3707 static void
3708 emit_gl_shader_state(struct v3dv_cmd_buffer *cmd_buffer)
3709 {
3710    struct v3dv_job *job = cmd_buffer->state.job;
3711    assert(job);
3712 
3713    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3714    struct v3dv_pipeline *pipeline = state->pipeline;
3715    assert(pipeline);
3716 
3717    /* Upload the uniforms to the indirect CL first */
3718    struct v3dv_cl_reloc fs_uniforms =
3719       v3dv_write_uniforms(cmd_buffer, pipeline->fs);
3720 
3721    struct v3dv_cl_reloc vs_uniforms =
3722       v3dv_write_uniforms(cmd_buffer, pipeline->vs);
3723 
3724    struct v3dv_cl_reloc vs_bin_uniforms =
3725       v3dv_write_uniforms(cmd_buffer, pipeline->vs_bin);
3726 
3727    /* Update the cache dirty flag based on the shader progs data */
3728    job->tmu_dirty_rcl |= pipeline->vs_bin->current_variant->prog_data.vs->base.tmu_dirty_rcl;
3729    job->tmu_dirty_rcl |= pipeline->vs->current_variant->prog_data.vs->base.tmu_dirty_rcl;
3730    job->tmu_dirty_rcl |= pipeline->fs->current_variant->prog_data.fs->base.tmu_dirty_rcl;
3731 
3732    /* See GFXH-930 workaround below */
3733    uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
3734 
3735    uint32_t shader_rec_offset =
3736       v3dv_cl_ensure_space(&job->indirect,
3737                            cl_packet_length(GL_SHADER_STATE_RECORD) +
3738                            num_elements_to_emit *
3739                            cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
3740                            32);
3741    v3dv_return_if_oom(cmd_buffer, NULL);
3742 
3743    cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
3744                           pipeline->shader_state_record, shader) {
3745 
3746       /* FIXME: we are setting these values both here and during the
3747        * prepacking. This is because both cl_emit_with_prepacked and v3dv_pack
3748        * assert on the minimum values of these. It would be good to get
3749        * v3dv_pack to assert on the final value if possible.
3750        */
3751       shader.min_coord_shader_input_segments_required_in_play =
3752          pipeline->vpm_cfg_bin.As;
3753       shader.min_vertex_shader_input_segments_required_in_play =
3754          pipeline->vpm_cfg.As;
3755 
3756       shader.coordinate_shader_code_address =
3757          v3dv_cl_address(pipeline->vs_bin->current_variant->assembly_bo, 0);
3758       shader.vertex_shader_code_address =
3759          v3dv_cl_address(pipeline->vs->current_variant->assembly_bo, 0);
3760       shader.fragment_shader_code_address =
3761          v3dv_cl_address(pipeline->fs->current_variant->assembly_bo, 0);
3762 
3763       shader.coordinate_shader_uniforms_address = vs_bin_uniforms;
3764       shader.vertex_shader_uniforms_address = vs_uniforms;
3765       shader.fragment_shader_uniforms_address = fs_uniforms;
3766 
3767       shader.address_of_default_attribute_values =
3768          v3dv_cl_address(pipeline->default_attribute_values, 0);
3769    }
3770 
3771    /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
3772    struct v3d_vs_prog_data *prog_data_vs =
3773       pipeline->vs->current_variant->prog_data.vs;
3774 
3775    struct v3d_vs_prog_data *prog_data_vs_bin =
3776       pipeline->vs_bin->current_variant->prog_data.vs;
3777 
3778    bool cs_loaded_any = false;
3779    const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
3780                                  prog_data_vs_bin->uses_biid ||
3781                                  prog_data_vs_bin->uses_vid;
3782    const uint32_t packet_length =
3783       cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
3784 
3785    uint32_t emitted_va_count = 0;
3786    for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
3787       assert(i < MAX_VERTEX_ATTRIBS);
3788 
3789       if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
3790          continue;
3791 
3792       const uint32_t binding = pipeline->va[i].binding;
3793 
3794       /* We store each vertex attribute in the array using its driver location
3795        * as index.
3796        */
3797       const uint32_t location = i;
3798 
3799       struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
3800 
3801       cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
3802                              &pipeline->vertex_attrs[i * packet_length], attr) {
3803 
3804          assert(c_vb->buffer->mem->bo);
3805          attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
3806                                         c_vb->buffer->mem_offset +
3807                                         pipeline->va[i].offset +
3808                                         c_vb->offset);
3809 
3810          attr.number_of_values_read_by_coordinate_shader =
3811             prog_data_vs_bin->vattr_sizes[location];
3812          attr.number_of_values_read_by_vertex_shader =
3813             prog_data_vs->vattr_sizes[location];
3814 
3815          /* GFXH-930: At least one attribute must be enabled and read by CS
3816           * and VS.  If we have attributes being consumed by the VS but not
3817           * the CS, then set up a dummy load of the last attribute into the
3818           * CS's VPM inputs.  (Since CS is just dead-code-elimination compared
3819           * to VS, we can't have CS loading but not VS).
3820           *
3821           * GFXH-1602: first attribute must be active if using builtins.
3822           */
3823          if (prog_data_vs_bin->vattr_sizes[location])
3824             cs_loaded_any = true;
3825 
3826          if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
3827             attr.number_of_values_read_by_coordinate_shader = 1;
3828             cs_loaded_any = true;
3829          } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
3830             attr.number_of_values_read_by_coordinate_shader = 1;
3831             cs_loaded_any = true;
3832          }
3833 
3834          attr.maximum_index = 0xffffff;
3835       }
3836 
3837       emitted_va_count++;
3838    }
3839 
3840    if (pipeline->va_count == 0) {
3841       /* GFXH-930: At least one attribute must be enabled and read
3842        * by CS and VS.  If we have no attributes being consumed by
3843        * the shader, set up a dummy to be loaded into the VPM.
3844        */
3845       cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
3846          /* Valid address of data whose value will be unused. */
3847          attr.address = v3dv_cl_address(job->indirect.bo, 0);
3848 
3849          attr.type = ATTRIBUTE_FLOAT;
3850          attr.stride = 0;
3851          attr.vec_size = 1;
3852 
3853          attr.number_of_values_read_by_coordinate_shader = 1;
3854          attr.number_of_values_read_by_vertex_shader = 1;
3855       }
3856    }
3857 
3858    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
3859       v3dv_cl_ensure_space_with_branch(&job->bcl,
3860                                        sizeof(pipeline->vcm_cache_size));
3861       v3dv_return_if_oom(cmd_buffer, NULL);
3862 
3863       cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
3864    }
3865 
3866    v3dv_cl_ensure_space_with_branch(&job->bcl,
3867                                     cl_packet_length(GL_SHADER_STATE));
3868    v3dv_return_if_oom(cmd_buffer, NULL);
3869 
3870    cl_emit(&job->bcl, GL_SHADER_STATE, state) {
3871       state.address = v3dv_cl_address(job->indirect.bo,
3872                                       shader_rec_offset);
3873       state.number_of_attribute_arrays = num_elements_to_emit;
3874    }
3875 
3876    cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
3877                                 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
3878                                 V3DV_CMD_DIRTY_PUSH_CONSTANTS);
3879 }
3880 
3881 static void
3882 emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3883 {
3884    struct v3dv_job *job = cmd_buffer->state.job;
3885    assert(job);
3886 
3887    v3dv_cl_ensure_space_with_branch(&job->bcl,
3888                                     cl_packet_length(OCCLUSION_QUERY_COUNTER));
3889    v3dv_return_if_oom(cmd_buffer, NULL);
3890 
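   /* If there is no active occlusion query this emits the packet with a zero
    * address, which effectively disables the occlusion counter for subsequent
    * draws (informal note).
    */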
3891    cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
3892       if (cmd_buffer->state.query.active_query) {
3893          counter.address =
3894             v3dv_cl_address(cmd_buffer->state.query.active_query, 0);
3895       }
3896    }
3897 
3898    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3899 }
3900 
3901 /* This stores command buffer state that we might be about to stomp for
3902  * a meta operation.
3903  */
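/* Informal note: this push/pop pair is used by meta operations (for example
 * the clear/copy helpers implemented with internal draws) so they can bind
 * their own pipelines, descriptor and dynamic state and then restore whatever
 * the application had bound before continuing to record commands.
 */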
3904 void
3905 v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
3906                                 bool push_descriptor_state)
3907 {
3908    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3909 
3910    if (state->subpass_idx != -1) {
3911       state->meta.subpass_idx = state->subpass_idx;
3912       state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
3913       state->meta.pass = v3dv_render_pass_to_handle(state->pass);
3914 
3915       const uint32_t attachment_state_item_size =
3916          sizeof(struct v3dv_cmd_buffer_attachment_state);
3917       const uint32_t attachment_state_total_size =
3918          attachment_state_item_size * state->attachment_alloc_count;
3919       if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
3920          if (state->meta.attachment_alloc_count > 0)
3921             vk_free(&cmd_buffer->device->alloc, state->meta.attachments);
3922 
3923          state->meta.attachments = vk_zalloc(&cmd_buffer->device->alloc,
3924                                              attachment_state_total_size, 8,
3925                                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3926          if (!state->meta.attachments) {
3927             v3dv_flag_oom(cmd_buffer, NULL);
3928             return;
3929          }
3930          state->meta.attachment_alloc_count = state->attachment_alloc_count;
3931       }
3932       state->meta.attachment_count = state->attachment_alloc_count;
3933       memcpy(state->meta.attachments, state->attachments,
3934              attachment_state_total_size);
3935 
3936       state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
3937       memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
3938    }
3939 
3940    state->meta.pipeline = v3dv_pipeline_to_handle(state->pipeline);
3941    memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
3942 
3943    /* We expect that meta operations are graphics-only and won't alter
3944     * compute state.
3945     */
3946    struct v3dv_descriptor_state *gfx_descriptor_state =
3947       &state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
3948    if (push_descriptor_state) {
3949       if (gfx_descriptor_state->valid != 0) {
3950          memcpy(&state->meta.descriptor_state, gfx_descriptor_state,
3951                 sizeof(state->descriptor_state));
3952       }
3953       state->meta.has_descriptor_state = true;
3954    } else {
3955       state->meta.has_descriptor_state = false;
3956    }
3957 
3958    /* FIXME: if we keep track of whether we have bound any push constant state
3959     *        at all we could restrict this to only the cases where it is actually
3960     *        necessary.
3961     */
3962    memcpy(state->meta.push_constants, cmd_buffer->push_constants_data,
3963           sizeof(state->meta.push_constants));
3964 }
3965 
3966 /* This restores command buffer state after a meta operation
3967  */
3968 void
3969 v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
3970                                uint32_t dirty_dynamic_state,
3971                                bool needs_subpass_resume)
3972 {
3973    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3974 
3975    if (state->meta.subpass_idx != -1) {
3976       state->pass = v3dv_render_pass_from_handle(state->meta.pass);
3977       state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
3978 
3979       assert(state->meta.attachment_count <= state->attachment_alloc_count);
3980       const uint32_t attachment_state_item_size =
3981          sizeof(struct v3dv_cmd_buffer_attachment_state);
3982       const uint32_t attachment_state_total_size =
3983          attachment_state_item_size * state->meta.attachment_count;
3984       memcpy(state->attachments, state->meta.attachments,
3985              attachment_state_total_size);
3986 
3987       state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
3988       memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
3989 
3990       /* If needs_subpass_resume is true it means that we emitted the meta
3991        * operation in its own job (possibly with an RT config that is
3992        * incompatible with the current subpass), so resuming subpass execution
3993        * after it requires that we create a new job with the subpass RT setup.
3994        */
3995       if (needs_subpass_resume)
3996          v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
3997    } else {
3998       state->subpass_idx = -1;
3999    }
4000 
4001    if (state->meta.pipeline != VK_NULL_HANDLE) {
4002       struct v3dv_pipeline *pipeline =
4003             v3dv_pipeline_from_handle(state->meta.pipeline);
4004       VkPipelineBindPoint pipeline_binding =
4005          v3dv_pipeline_get_binding_point(pipeline);
4006       v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
4007                            pipeline_binding,
4008                            state->meta.pipeline);
4009    } else {
4010       state->pipeline = VK_NULL_HANDLE;
4011    }
4012 
4013    if (dirty_dynamic_state) {
4014       memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
4015       state->dirty |= dirty_dynamic_state;
4016    }
4017 
4018    if (state->meta.has_descriptor_state) {
4019       if (state->meta.descriptor_state.valid != 0) {
4020          memcpy(&state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS],
4021                 &state->meta.descriptor_state,
4022                 sizeof(state->descriptor_state));
4023       } else {
4024          state->descriptor_state[VK_PIPELINE_BIND_POINT_GRAPHICS].valid = 0;
4025       }
4026    }
4027 
4028    memcpy(cmd_buffer->push_constants_data, state->meta.push_constants,
4029           sizeof(state->meta.push_constants));
4030 
4031    state->meta.pipeline = VK_NULL_HANDLE;
4032    state->meta.framebuffer = VK_NULL_HANDLE;
4033    state->meta.pass = VK_NULL_HANDLE;
4034    state->meta.subpass_idx = -1;
4035    state->meta.has_descriptor_state = false;
4036 }
4037 
4038 /* FIXME: C&P from v3dx_draw. Refactor to common place? */
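     /* For reference: the basic pipe_prim_type values match the hardware
      * primitive encoding directly (POINTS = 0 ... TRIANGLE_FAN = 6), while the
      * four adjacency variants are remapped onto 8..11; assuming the usual
      * gallium enum ordering, PIPE_PRIM_TRIANGLES_ADJACENCY maps to 8 + 2 = 10.
      */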
4039 static uint32_t
4040 v3d_hw_prim_type(enum pipe_prim_type prim_type)
4041 {
4042    switch (prim_type) {
4043    case PIPE_PRIM_POINTS:
4044    case PIPE_PRIM_LINES:
4045    case PIPE_PRIM_LINE_LOOP:
4046    case PIPE_PRIM_LINE_STRIP:
4047    case PIPE_PRIM_TRIANGLES:
4048    case PIPE_PRIM_TRIANGLE_STRIP:
4049    case PIPE_PRIM_TRIANGLE_FAN:
4050       return prim_type;
4051 
4052    case PIPE_PRIM_LINES_ADJACENCY:
4053    case PIPE_PRIM_LINE_STRIP_ADJACENCY:
4054    case PIPE_PRIM_TRIANGLES_ADJACENCY:
4055    case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
4056       return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY);
4057 
4058    default:
4059       unreachable("Unsupported primitive type");
4060    }
4061 }
4062 
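     /* Draw parameters, mirroring the vkCmdDraw arguments. */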
4063 struct v3dv_draw_info {
4064    uint32_t vertex_count;
4065    uint32_t instance_count;
4066    uint32_t first_vertex;
4067    uint32_t first_instance;
4068 };
4069 
4070 static void
4071 cmd_buffer_emit_draw(struct v3dv_cmd_buffer *cmd_buffer,
4072                      struct v3dv_draw_info *info)
4073 {
4074    struct v3dv_job *job = cmd_buffer->state.job;
4075    assert(job);
4076 
4077    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4078    struct v3dv_pipeline *pipeline = state->pipeline;
4079 
4080    assert(pipeline);
4081 
4082    uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
4083 
4084    if (info->first_instance > 0) {
4085       v3dv_cl_ensure_space_with_branch(
4086          &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
4087       v3dv_return_if_oom(cmd_buffer, NULL);
4088 
4089       cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
4090          base.base_instance = info->first_instance;
4091          base.base_vertex = 0;
4092       }
4093    }
4094 
4095    if (info->instance_count > 1) {
4096       v3dv_cl_ensure_space_with_branch(
4097          &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
4098       v3dv_return_if_oom(cmd_buffer, NULL);
4099 
4100       cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
4101          prim.mode = hw_prim_type;
4102          prim.index_of_first_vertex = info->first_vertex;
4103          prim.number_of_instances = info->instance_count;
4104          prim.instance_length = info->vertex_count;
4105       }
4106    } else {
4107       v3dv_cl_ensure_space_with_branch(
4108          &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
4109       v3dv_return_if_oom(cmd_buffer, NULL);
4110       cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
4111          prim.mode = hw_prim_type;
4112          prim.length = info->vertex_count;
4113          prim.index_of_first_vertex = info->first_vertex;
4114       }
4115    }
4116 }
4117 
4118 static struct v3dv_job *
4119 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
4120 {
4121    struct v3dv_job *job = cmd_buffer->state.job;
4122    assert(job);
4123 
4124    /* If the job has been flagged with 'always_flush' and it has already
4125     * recorded any draw calls then we need to start a new job for it.
4126     */
4127    if (job->always_flush && job->draw_count > 0) {
4128       assert(cmd_buffer->state.pass);
4129       /* First, flag the current job as not being the last in the
4130        * current subpass
4131        */
4132       job->is_subpass_finish = false;
4133 
4134       /* Now start a new job in the same subpass and flag it as continuing
4135        * the current subpass.
4136        */
4137       job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
4138                                            cmd_buffer->state.subpass_idx);
4139       assert(job->draw_count == 0);
4140 
4141       /* Inherit the 'always flush' behavior */
4142       job->always_flush = true;
4143    }
4144 
4145    assert(job->draw_count == 0 || !job->always_flush);
4146    return job;
4147 }
4148 
4149 /**
4150  * The Vulkan spec states:
4151  *
4152  *   "It is legal for a subpass to use no color or depth/stencil
4153  *    attachments (...)  This kind of subpass can use shader side effects such
4154  *    as image stores and atomics to produce an output. In this case, the
4155  *    subpass continues to use the width, height, and layers of the framebuffer
4156  *    to define the dimensions of the rendering area, and the
4157  *    rasterizationSamples from each pipeline’s
4158  *    VkPipelineMultisampleStateCreateInfo to define the number of samples used
4159  *    in rasterization."
4160  *
4161  * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
4162  * emit when we start a new frame at the beginning of a subpass. At that point,
4163  * if the framebuffer doesn't have any attachments we won't enable MSAA and
4164  * the job won't be valid in the scenario described by the spec.
4165  *
4166  * This function is intended to be called before a draw call and will test if
4167  * we are in that scenario, in which case, it will restart the current job
4168  * with MSAA enabled.
4169  */
4170 static void
4171 cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
4172 {
4173    assert(cmd_buffer->state.job);
4174 
4175    /* We don't support variableMultisampleRate so we know that all pipelines
4176     * bound in the same subpass must have a matching sample count, so we
4177     * only need to do this check on the first draw call.
4178     */
4179    if (cmd_buffer->state.job->draw_count > 0)
4180       return;
4181 
4182    /* We only need to restart the frame if the pipeline requires MSAA but
4183     * our frame tiling didn't enable it.
4184     */
4185    if (!cmd_buffer->state.pipeline->msaa ||
4186        cmd_buffer->state.job->frame_tiling.msaa) {
4187       return;
4188    }
4189 
4190    /* FIXME: Secondary command buffers don't start frames. Instead, they are
4191     * recorded into primary jobs that start them. For secondaries, we should
4192     * still handle this scenario, but we should do that when we record them
4193     * into primaries by testing if any of the secondaries has multisampled
4194     * draw calls in them, and then using that info to decide if we need to
4195     * restart the primary job into which they are being recorded.
4196     */
4197    if (cmd_buffer->level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
4198       return;
4199 
4200    /* Drop the current job and restart it with MSAA enabled */
4201    struct v3dv_job *old_job = cmd_buffer->state.job;
4202    cmd_buffer->state.job = NULL;
4203 
4204    struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
4205                                     sizeof(struct v3dv_job), 8,
4206                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4207    if (!job) {
4208       v3dv_flag_oom(cmd_buffer, NULL);
4209       return;
4210    }
4211 
4212    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
4213                  cmd_buffer->state.subpass_idx);
4214    cmd_buffer->state.job = job;
4215 
4216    v3dv_job_start_frame(job,
4217                         old_job->frame_tiling.width,
4218                         old_job->frame_tiling.height,
4219                         old_job->frame_tiling.layers,
4220                         old_job->frame_tiling.render_target_count,
4221                         old_job->frame_tiling.internal_bpp,
4222                         true /* msaa */);
4223 
4224    v3dv_job_destroy(old_job);
4225 }
4226 
4227 static void
4228 cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer)
4229 {
4230    assert(cmd_buffer->state.pipeline);
4231    assert(!(cmd_buffer->state.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
4232 
4233    /* If we emitted a pipeline barrier right before this draw we won't have
4234     * an active job. In that case, create a new job continuing the current
4235     * subpass.
4236     */
4237    struct v3dv_job *job = cmd_buffer->state.job;
4238    if (!job) {
4239       job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
4240                                            cmd_buffer->state.subpass_idx);
4241    }
4242 
4243    /* Restart single sample job for MSAA pipeline if needed */
4244    cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
4245 
4246    /* If the job is configured to flush on every draw call we need to create
4247     * a new job now.
4248     */
4249    job = cmd_buffer_pre_draw_split_job(cmd_buffer);
4250    job->draw_count++;
4251 
4252    /* We may need to compile shader variants based on bound textures */
4253    uint32_t *dirty = &cmd_buffer->state.dirty;
4254    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
4255                  V3DV_CMD_DIRTY_DESCRIPTOR_SETS)) {
4256       update_pipeline_variants(cmd_buffer);
4257    }
4258 
4259    /* GL shader state binds shaders, uniform and vertex attribute state. The
4260     * compiler injects uniforms to handle some descriptor types (such as
4261     * textures), so we need to regenerate it when descriptor state changes.
4262     *
4263     * We also need to emit new shader state if we have a dirty viewport since
4264     * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
4265     */
4266    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
4267                  V3DV_CMD_DIRTY_VERTEX_BUFFER |
4268                  V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
4269                  V3DV_CMD_DIRTY_PUSH_CONSTANTS |
4270                  V3DV_CMD_DIRTY_VIEWPORT)) {
4271       emit_gl_shader_state(cmd_buffer);
4272    }
4273 
4274    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
4275       emit_configuration_bits(cmd_buffer);
4276       emit_varyings_state(cmd_buffer);
4277    }
4278 
4279    if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
4280       emit_scissor(cmd_buffer);
4281    }
4282 
4283    if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
4284       emit_viewport(cmd_buffer);
4285    }
4286 
4287    const uint32_t dynamic_stencil_dirty_flags =
4288       V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
4289       V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
4290       V3DV_CMD_DIRTY_STENCIL_REFERENCE;
4291    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
4292       emit_stencil(cmd_buffer);
4293 
4294    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
4295       emit_depth_bias(cmd_buffer);
4296 
4297    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
4298       emit_blend(cmd_buffer);
4299 
4300    if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
4301       emit_occlusion_query(cmd_buffer);
4302 
4303    if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
4304       emit_line_width(cmd_buffer);
4305 
4306    if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
4307       emit_sample_state(cmd_buffer);
4308 
4309    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
4310 }
4311 
4312 static void
4313 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
4314                 struct v3dv_draw_info *info)
4315 {
4316    cmd_buffer_emit_pre_draw(cmd_buffer);
4317    cmd_buffer_emit_draw(cmd_buffer, info);
4318 }
4319 
4320 void
4321 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
4322              uint32_t vertexCount,
4323              uint32_t instanceCount,
4324              uint32_t firstVertex,
4325              uint32_t firstInstance)
4326 {
4327    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4328    struct v3dv_draw_info info = {};
4329    info.vertex_count = vertexCount;
4330    info.instance_count = instanceCount;
4331    info.first_instance = firstInstance;
4332    info.first_vertex = firstVertex;
4333 
4334    cmd_buffer_draw(cmd_buffer, &info);
4335 }
4336 
4337 void
4338 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4339                     uint32_t indexCount,
4340                     uint32_t instanceCount,
4341                     uint32_t firstIndex,
4342                     int32_t vertexOffset,
4343                     uint32_t firstInstance)
4344 {
4345    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4346 
4347    cmd_buffer_emit_pre_draw(cmd_buffer);
4348 
4349    struct v3dv_job *job = cmd_buffer->state.job;
4350    assert(job);
4351 
4352    const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
4353    uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
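        /* index_type encodes the element size as log2 of the byte size
         * (2-byte indices -> 1, 4-byte indices -> 2), hence ffs(size) - 1.
         */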
4354    uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
4355    uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
4356 
4357    if (vertexOffset != 0 || firstInstance != 0) {
4358       v3dv_cl_ensure_space_with_branch(
4359          &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
4360       v3dv_return_if_oom(cmd_buffer, NULL);
4361 
4362       cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
4363          base.base_instance = firstInstance;
4364          base.base_vertex = vertexOffset;
4365       }
4366    }
4367 
4368    if (instanceCount == 1) {
4369       v3dv_cl_ensure_space_with_branch(
4370          &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
4371       v3dv_return_if_oom(cmd_buffer, NULL);
4372 
4373       cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
4374          prim.index_type = index_type;
4375          prim.length = indexCount;
4376          prim.index_offset = index_offset;
4377          prim.mode = hw_prim_type;
4378          prim.enable_primitive_restarts = pipeline->primitive_restart;
4379       }
4380    } else if (instanceCount > 1) {
4381       v3dv_cl_ensure_space_with_branch(
4382          &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
4383       v3dv_return_if_oom(cmd_buffer, NULL);
4384 
4385       cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
4386          prim.index_type = index_type;
4387          prim.index_offset = index_offset;
4388          prim.mode = hw_prim_type;
4389          prim.enable_primitive_restarts = pipeline->primitive_restart;
4390          prim.number_of_instances = instanceCount;
4391          prim.instance_length = indexCount;
4392       }
4393    }
4394 }
4395 
4396 void
4397 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4398                      VkBuffer _buffer,
4399                      VkDeviceSize offset,
4400                      uint32_t drawCount,
4401                      uint32_t stride)
4402 {
4403    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4404    V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
4405 
4406    /* drawCount is the number of draws to execute, and can be zero. */
4407    if (drawCount == 0)
4408       return;
4409 
4410    cmd_buffer_emit_pre_draw(cmd_buffer);
4411 
4412    struct v3dv_job *job = cmd_buffer->state.job;
4413    assert(job);
4414 
4415    const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
4416    uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
4417 
4418    v3dv_cl_ensure_space_with_branch(
4419       &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
4420    v3dv_return_if_oom(cmd_buffer, NULL);
4421 
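        /* The hardware fetches drawCount VkDrawIndirectCommand records directly
         * from the buffer; the record stride is expressed in 4-byte units, hence
         * the >> 2 (Vulkan requires the stride to be a multiple of 4).
         */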
4422    cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
4423       prim.mode = hw_prim_type;
4424       prim.number_of_draw_indirect_array_records = drawCount;
4425       prim.stride_in_multiples_of_4_bytes = stride >> 2;
4426       prim.address = v3dv_cl_address(buffer->mem->bo,
4427                                      buffer->mem_offset + offset);
4428    }
4429 }
4430 
4431 void
4432 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
4433                             VkBuffer _buffer,
4434                             VkDeviceSize offset,
4435                             uint32_t drawCount,
4436                             uint32_t stride)
4437 {
4438    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4439    V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
4440 
4441    /* drawCount is the number of draws to execute, and can be zero. */
4442    if (drawCount == 0)
4443       return;
4444 
4445    cmd_buffer_emit_pre_draw(cmd_buffer);
4446 
4447    struct v3dv_job *job = cmd_buffer->state.job;
4448    assert(job);
4449 
4450    const struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
4451    uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->vs->topology);
4452    uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
4453 
4454    v3dv_cl_ensure_space_with_branch(
4455       &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
4456    v3dv_return_if_oom(cmd_buffer, NULL);
4457 
4458    cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
4459       prim.index_type = index_type;
4460       prim.mode = hw_prim_type;
4461       prim.enable_primitive_restarts = pipeline->primitive_restart;
4462       prim.number_of_draw_indirect_indexed_records = drawCount;
4463       prim.stride_in_multiples_of_4_bytes = stride >> 2;
4464       prim.address = v3dv_cl_address(buffer->mem->bo,
4465                                      buffer->mem_offset + offset);
4466    }
4467 }
4468 
4469 void
4470 v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4471                         VkPipelineStageFlags srcStageMask,
4472                         VkPipelineStageFlags dstStageMask,
4473                         VkDependencyFlags dependencyFlags,
4474                         uint32_t memoryBarrierCount,
4475                         const VkMemoryBarrier *pMemoryBarriers,
4476                         uint32_t bufferBarrierCount,
4477                         const VkBufferMemoryBarrier *pBufferBarriers,
4478                         uint32_t imageBarrierCount,
4479                         const VkImageMemoryBarrier *pImageBarriers)
4480 {
4481    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4482 
4483    /* We only care about barriers between GPU jobs */
4484    if (srcStageMask == VK_PIPELINE_STAGE_HOST_BIT ||
4485        dstStageMask == VK_PIPELINE_STAGE_HOST_BIT) {
4486       return;
4487    }
4488 
4489    /* If we have a recording job, finish it here */
4490    struct v3dv_job *job = cmd_buffer->state.job;
4491    if (job)
4492       v3dv_cmd_buffer_finish_job(cmd_buffer);
4493 
4494    cmd_buffer->state.has_barrier = true;
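        /* If the barrier targets stages consumed during binning (vertex input,
         * vertex/geometry/tessellation shading or indirect draw parameters), the
         * following jobs also need to serialize their binning work, which is
         * what has_bcl_barrier tracks.
         */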
4495    if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
4496                        VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
4497                        VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
4498                        VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
4499                        VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
4500                        VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {
4501       cmd_buffer->state.has_bcl_barrier = true;
4502    }
4503 }
4504 
4505 void
4506 v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
4507                           uint32_t firstBinding,
4508                           uint32_t bindingCount,
4509                           const VkBuffer *pBuffers,
4510                           const VkDeviceSize *pOffsets)
4511 {
4512    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4513    struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
4514 
4515    /* We have to defer setting up the vertex buffers since we need the buffer
4516     * stride from the pipeline.
4517     */
4518 
4519    assert(firstBinding + bindingCount <= MAX_VBS);
4520    bool vb_state_changed = false;
4521    for (uint32_t i = 0; i < bindingCount; i++) {
4522       if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) {
4523          vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
4524          vb_state_changed = true;
4525       }
4526       if (vb[firstBinding + i].offset != pOffsets[i]) {
4527          vb[firstBinding + i].offset = pOffsets[i];
4528          vb_state_changed = true;
4529       }
4530    }
4531 
4532    if (vb_state_changed)
4533       cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
4534 }
4535 
4536 static uint32_t
4537 get_index_size(VkIndexType index_type)
4538 {
4539    switch (index_type) {
4540    case VK_INDEX_TYPE_UINT16:
4541       return 2;
4542       break;
4543    case VK_INDEX_TYPE_UINT32:
4544       return 4;
4545       break;
4546    default:
4547       unreachable("Unsupported index type");
4548    }
4549 }
4550 
4551 void
4552 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
4553                         VkBuffer buffer,
4554                         VkDeviceSize offset,
4555                         VkIndexType indexType)
4556 {
4557    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4558    V3DV_FROM_HANDLE(v3dv_buffer, ibuffer, buffer);
4559 
4560    struct v3dv_job *job = cmd_buffer->state.job;
4561    assert(job);
4562 
4563    v3dv_cl_ensure_space_with_branch(
4564       &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
4565    v3dv_return_if_oom(cmd_buffer, NULL);
4566 
4567    const uint32_t index_size = get_index_size(indexType);
4568 
4569    /* If we have started a new job we always need to emit index buffer state.
4570     * We know we are in that scenario because that is the only case where we
4571     * set the dirty bit.
4572     */
4573    if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)) {
4574       if (buffer == cmd_buffer->state.index_buffer.buffer &&
4575           offset == cmd_buffer->state.index_buffer.offset &&
4576           index_size == cmd_buffer->state.index_buffer.index_size) {
4577          return;
4578       }
4579    }
4580 
4581    cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
4582       ib.address = v3dv_cl_address(ibuffer->mem->bo,
4583                                    ibuffer->mem_offset + offset);
4584       ib.size = ibuffer->mem->bo->size;
4585    }
4586 
4587    cmd_buffer->state.index_buffer.buffer = buffer;
4588    cmd_buffer->state.index_buffer.offset = offset;
4589    cmd_buffer->state.index_buffer.index_size = index_size;
4590 
4591    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
4592 }
4593 
4594 void
4595 v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
4596                               VkStencilFaceFlags faceMask,
4597                               uint32_t compareMask)
4598 {
4599    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4600 
4601    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4602       cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;
4603    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4604       cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;
4605 
4606    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
4607 }
4608 
4609 void
4610 v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
4611                             VkStencilFaceFlags faceMask,
4612                             uint32_t writeMask)
4613 {
4614    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4615 
4616    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4617       cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;
4618    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4619       cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;
4620 
4621    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
4622 }
4623 
4624 void
4625 v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
4626                             VkStencilFaceFlags faceMask,
4627                             uint32_t reference)
4628 {
4629    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4630 
4631    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4632       cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;
4633    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4634       cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;
4635 
4636    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
4637 }
4638 
4639 void
4640 v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
4641                      float depthBiasConstantFactor,
4642                      float depthBiasClamp,
4643                      float depthBiasSlopeFactor)
4644 {
4645    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4646 
4647    cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
4648    cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
4649    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
4650 }
4651 
4652 void
4653 v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
4654                        float minDepthBounds,
4655                        float maxDepthBounds)
4656 {
4657    /* We do not support depth bounds testing so we just ignore this. We are
4658     * already asserting that pipelines don't enable the feature anyway.
4659     */
4660 }
4661 
4662 void
4663 v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
4664                      float lineWidth)
4665 {
4666    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4667 
4668    cmd_buffer->state.dynamic.line_width = lineWidth;
4669    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
4670 }
4671 
4672 void
4673 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
4674                            VkPipelineBindPoint pipelineBindPoint,
4675                            VkPipelineLayout _layout,
4676                            uint32_t firstSet,
4677                            uint32_t descriptorSetCount,
4678                            const VkDescriptorSet *pDescriptorSets,
4679                            uint32_t dynamicOffsetCount,
4680                            const uint32_t *pDynamicOffsets)
4681 {
4682    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4683    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
4684 
4685    uint32_t dyn_index = 0;
4686 
4687    assert(firstSet + descriptorSetCount <= MAX_SETS);
4688 
4689    struct v3dv_descriptor_state *descriptor_state =
4690       &cmd_buffer->state.descriptor_state[pipelineBindPoint];
4691 
4692    bool descriptor_state_changed = false;
4693    for (uint32_t i = 0; i < descriptorSetCount; i++) {
4694       V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
4695       uint32_t index = firstSet + i;
4696 
4697       if (descriptor_state->descriptor_sets[index] != set) {
4698          descriptor_state->descriptor_sets[index] = set;
4699          descriptor_state_changed = true;
4700       }
4701 
4702       if (!(descriptor_state->valid & (1u << index))) {
4703          descriptor_state->valid |= (1u << index);
4704          descriptor_state_changed = true;
4705       }
4706 
4707       for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
4708          uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;
4709 
4710          if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
4711             descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
4712             descriptor_state_changed = true;
4713          }
4714       }
4715    }
4716 
4717    if (descriptor_state_changed) {
4718       if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS)
4719          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
4720       else
4721          cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
4722    }
4723 }
4724 
4725 void
4726 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
4727                       VkPipelineLayout layout,
4728                       VkShaderStageFlags stageFlags,
4729                       uint32_t offset,
4730                       uint32_t size,
4731                       const void *pValues)
4732 {
4733    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4734 
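        /* Skip the update (and the dirty flag) if the incoming range matches
         * what is already stored for these offsets.
         */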
4735    if (!memcmp((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size))
4736       return;
4737 
4738    memcpy((uint8_t *) cmd_buffer->push_constants_data + offset, pValues, size);
4739 
4740    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS;
4741 }
4742 
4743 void
4744 v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
4745                           const float blendConstants[4])
4746 {
4747    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4748    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4749 
4750    if (!memcmp(state->dynamic.blend_constants, blendConstants,
4751                sizeof(state->dynamic.blend_constants))) {
4752       return;
4753    }
4754 
4755    memcpy(state->dynamic.blend_constants, blendConstants,
4756           sizeof(state->dynamic.blend_constants));
4757 
4758    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
4759 }
4760 
4761 void
4762 v3dv_cmd_buffer_reset_queries(struct v3dv_cmd_buffer *cmd_buffer,
4763                               struct v3dv_query_pool *pool,
4764                               uint32_t first,
4765                               uint32_t count)
4766 {
4767    /* Resets can only happen outside a render pass instance so we should not
4768     * be in the middle of job recording.
4769     */
4770    assert(cmd_buffer->state.pass == NULL);
4771    assert(cmd_buffer->state.job == NULL);
4772 
4773    assert(first < pool->query_count);
4774    assert(first + count <= pool->query_count);
4775 
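        /* The reset is recorded as a CPU job so that, at submit time, it runs in
         * order with any GPU jobs recorded before it in this command buffer.
         */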
4776    struct v3dv_job *job =
4777       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4778                                      V3DV_JOB_TYPE_CPU_RESET_QUERIES,
4779                                      cmd_buffer, -1);
4780    v3dv_return_if_oom(cmd_buffer, NULL);
4781 
4782    job->cpu.query_reset.pool = pool;
4783    job->cpu.query_reset.first = first;
4784    job->cpu.query_reset.count = count;
4785 
4786    list_addtail(&job->list_link, &cmd_buffer->jobs);
4787 }
4788 
4789 static void
4790 ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
4791                    uint32_t slot_size,
4792                    uint32_t used_count,
4793                    uint32_t *alloc_count,
4794                    void **ptr)
4795 {
4796    if (used_count >= *alloc_count) {
4797       const uint32_t prev_slot_count = *alloc_count;
4798       void *old_buffer = *ptr;
4799 
4800       const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
4801       const uint32_t bytes = new_slot_count * slot_size;
4802       *ptr = vk_alloc(&cmd_buffer->device->alloc, bytes, 8,
4803                       VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4804       if (*ptr == NULL) {
4805          fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");
4806          v3dv_flag_oom(cmd_buffer, NULL);
4807          return;
4808       }
4809 
4810       memcpy(*ptr, old_buffer, prev_slot_count * slot_size);
4811       *alloc_count = new_slot_count;
4812    }
4813    assert(used_count < *alloc_count);
4814 }
4815 
4816 void
4817 v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
4818                             struct v3dv_query_pool *pool,
4819                             uint32_t query,
4820                             VkQueryControlFlags flags)
4821 {
4822    /* FIXME: we only support one active query for now */
4823    assert(cmd_buffer->state.query.active_query == NULL);
4824    assert(query < pool->query_count);
4825 
4826    cmd_buffer->state.query.active_query = pool->queries[query].bo;
4827    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4828 }
4829 
4830 void
4831 v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4832                           struct v3dv_query_pool *pool,
4833                           uint32_t query)
4834 {
4835    assert(query < pool->query_count);
4836    assert(cmd_buffer->state.query.active_query != NULL);
4837 
4838    if  (cmd_buffer->state.pass) {
4839       /* Queue the EndQuery in the command buffer state, we will create a CPU
4840        * job to flag all of these queries as possibly available right after the
4841        * render pass job in which they have been recorded.
4842        */
4843       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4844       ensure_array_state(cmd_buffer,
4845                          sizeof(struct v3dv_end_query_cpu_job_info),
4846                          state->query.end.used_count,
4847                          &state->query.end.alloc_count,
4848                          (void **) &state->query.end.states);
4849       v3dv_return_if_oom(cmd_buffer, NULL);
4850 
4851       struct v3dv_end_query_cpu_job_info *info =
4852          &state->query.end.states[state->query.end.used_count++];
4853 
4854       info->pool = pool;
4855       info->query = query;
4856    } else {
4857       /* Otherwise, schedule the CPU job immediately */
4858       struct v3dv_job *job =
4859          v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4860                                         V3DV_JOB_TYPE_CPU_END_QUERY,
4861                                         cmd_buffer, -1);
4862       v3dv_return_if_oom(cmd_buffer, NULL);
4863 
4864       job->cpu.query_end.pool = pool;
4865       job->cpu.query_end.query = query;
4866       list_addtail(&job->list_link, &cmd_buffer->jobs);
4867    }
4868 
4869    cmd_buffer->state.query.active_query = NULL;
4870    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4871 }
4872 
4873 void
4874 v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
4875                                    struct v3dv_query_pool *pool,
4876                                    uint32_t first,
4877                                    uint32_t count,
4878                                    struct v3dv_buffer *dst,
4879                                    uint32_t offset,
4880                                    uint32_t stride,
4881                                    VkQueryResultFlags flags)
4882 {
4883    /* Copies can only happen outside a render pass instance so we should not
4884     * be in the middle of job recording.
4885     */
4886    assert(cmd_buffer->state.pass == NULL);
4887    assert(cmd_buffer->state.job == NULL);
4888 
4889    assert(first < pool->query_count);
4890    assert(first + count <= pool->query_count);
4891 
4892    struct v3dv_job *job =
4893       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4894                                      V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
4895                                      cmd_buffer, -1);
4896    v3dv_return_if_oom(cmd_buffer, NULL);
4897 
4898    job->cpu.query_copy_results.pool = pool;
4899    job->cpu.query_copy_results.first = first;
4900    job->cpu.query_copy_results.count = count;
4901    job->cpu.query_copy_results.dst = dst;
4902    job->cpu.query_copy_results.offset = offset;
4903    job->cpu.query_copy_results.stride = stride;
4904    job->cpu.query_copy_results.flags = flags;
4905 
4906    list_addtail(&job->list_link, &cmd_buffer->jobs);
4907 }
4908 
4909 void
4910 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
4911                             struct drm_v3d_submit_tfu *tfu)
4912 {
4913    struct v3dv_device *device = cmd_buffer->device;
4914    struct v3dv_job *job = vk_zalloc(&device->alloc,
4915                                     sizeof(struct v3dv_job), 8,
4916                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4917    if (!job) {
4918       v3dv_flag_oom(cmd_buffer, NULL);
4919       return;
4920    }
4921 
4922    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
4923    job->tfu = *tfu;
4924    list_addtail(&job->list_link, &cmd_buffer->jobs);
4925 }
4926 
4927 void
4928 v3dv_CmdSetEvent(VkCommandBuffer commandBuffer,
4929                  VkEvent _event,
4930                  VkPipelineStageFlags stageMask)
4931 {
4932    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4933    V3DV_FROM_HANDLE(v3dv_event, event, _event);
4934 
4935    /* Event (re)sets can only happen outside a render pass instance so we
4936     * should not be in the middle of job recording.
4937     */
4938    assert(cmd_buffer->state.pass == NULL);
4939    assert(cmd_buffer->state.job == NULL);
4940 
4941    struct v3dv_job *job =
4942       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4943                                      V3DV_JOB_TYPE_CPU_SET_EVENT,
4944                                      cmd_buffer, -1);
4945    v3dv_return_if_oom(cmd_buffer, NULL);
4946 
4947    job->cpu.event_set.event = event;
4948    job->cpu.event_set.state = 1;
4949 
4950    list_addtail(&job->list_link, &cmd_buffer->jobs);
4951 }
4952 
4953 void
4954 v3dv_CmdResetEvent(VkCommandBuffer commandBuffer,
4955                    VkEvent _event,
4956                    VkPipelineStageFlags stageMask)
4957 {
4958    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4959    V3DV_FROM_HANDLE(v3dv_event, event, _event);
4960 
4961    /* Event (re)sets can only happen outside a render pass instance so we
4962     * should not be in the middle of job recording.
4963     */
4964    assert(cmd_buffer->state.pass == NULL);
4965    assert(cmd_buffer->state.job == NULL);
4966 
4967    struct v3dv_job *job =
4968       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4969                                      V3DV_JOB_TYPE_CPU_SET_EVENT,
4970                                      cmd_buffer, -1);
4971    v3dv_return_if_oom(cmd_buffer, NULL);
4972 
4973    job->cpu.event_set.event = event;
4974    job->cpu.event_set.state = 0;
4975 
4976    list_addtail(&job->list_link, &cmd_buffer->jobs);
4977 }
4978 
4979 void
4980 v3dv_CmdWaitEvents(VkCommandBuffer commandBuffer,
4981                    uint32_t eventCount,
4982                    const VkEvent *pEvents,
4983                    VkPipelineStageFlags srcStageMask,
4984                    VkPipelineStageFlags dstStageMask,
4985                    uint32_t memoryBarrierCount,
4986                    const VkMemoryBarrier *pMemoryBarriers,
4987                    uint32_t bufferMemoryBarrierCount,
4988                    const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4989                    uint32_t imageMemoryBarrierCount,
4990                    const VkImageMemoryBarrier *pImageMemoryBarriers)
4991 {
4992    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4993 
4994    assert(eventCount > 0);
4995 
4996    struct v3dv_job *job =
4997       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4998                                      V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
4999                                      cmd_buffer, -1);
5000    v3dv_return_if_oom(cmd_buffer, NULL);
5001 
5002    const uint32_t event_list_size = sizeof(struct v3dv_event *) * eventCount;
5003 
5004    job->cpu.event_wait.events =
5005       vk_alloc(&cmd_buffer->device->alloc, event_list_size, 8,
5006                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
5007    if (!job->cpu.event_wait.events) {
5008       v3dv_flag_oom(cmd_buffer, NULL);
5009       return;
5010    }
5011    job->cpu.event_wait.event_count = eventCount;
5012 
5013    for (uint32_t i = 0; i < eventCount; i++)
5014       job->cpu.event_wait.events[i] = v3dv_event_from_handle(pEvents[i]);
5015 
5016    /* vkCmdWaitEvents can be recorded inside a render pass, so we might have
5017     * an active job.
5018     *
5019     * If we are inside a render pass, since vkCmd(Re)SetEvent can't happen
5020     * inside a render pass, it is safe to move the wait job so it happens right
5021     * before the current job we are recording for the subpass, if any
5022     * (it would actually be safe to move it all the way back to right before
5023     * the start of the render pass).
5024     *
5025     * If we are outside a render pass then we should not have any on-going job
5026     * and we are free to just add the wait job without restrictions.
5027     */
5028    assert(cmd_buffer->state.pass || !cmd_buffer->state.job);
5029    list_addtail(&job->list_link, &cmd_buffer->jobs);
5030 }
5031 
5032 void
5033 v3dv_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
5034                        VkPipelineStageFlagBits pipelineStage,
5035                        VkQueryPool queryPool,
5036                        uint32_t query)
5037 {
5038    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
5039    V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
5040 
5041    /* If this is called inside a render pass we need to finish the current
5042     * job here...
5043     */
5044    if (cmd_buffer->state.pass)
5045       v3dv_cmd_buffer_finish_job(cmd_buffer);
5046 
5047    struct v3dv_job *job =
5048       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
5049                                      V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
5050                                      cmd_buffer, -1);
5051    v3dv_return_if_oom(cmd_buffer, NULL);
5052 
5053    job->cpu.query_timestamp.pool = query_pool;
5054    job->cpu.query_timestamp.query = query;
5055 
5056    list_addtail(&job->list_link, &cmd_buffer->jobs);
5057    cmd_buffer->state.job = NULL;
5058 
5059    /* ...and resume the subpass after the timestamp */
5060    if (cmd_buffer->state.pass)
5061       v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
5062 }
5063 
5064 static void
5065 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
5066 {
5067    assert(cmd_buffer->state.pipeline);
5068    assert(cmd_buffer->state.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
5069 
5070    /* We may need to compile shader variants based on bound textures */
5071    uint32_t *dirty = &cmd_buffer->state.dirty;
5072    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
5073                  V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS)) {
5074       update_pipeline_variants(cmd_buffer);
5075    }
5076 
5077    *dirty &= ~(V3DV_CMD_DIRTY_PIPELINE |
5078                V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
5079 }
5080 
5081 #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
5082 #define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
5083 /* Allow this dispatch to start while the last one is still running. */
5084 #define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
5085 /* Maximum supergroup ID.  6 bits. */
5086 #define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
5087 /* Batches per supergroup minus 1.  8 bits. */
5088 #define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
5089 /* Workgroups per supergroup, 0 means 16 */
5090 #define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
5091 #define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
5092 
5093 #define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
5094 #define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
5095 #define V3D_CSD_CFG5_THREADING (1 << 0)
5096 
5097 void
5098 v3dv_cmd_buffer_rewrite_indirect_csd_job(
5099    struct v3dv_csd_indirect_cpu_job_info *info,
5100    const uint32_t *wg_counts)
5101 {
5102    assert(info->csd_job);
5103    struct v3dv_job *job = info->csd_job;
5104 
5105    assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
5106    assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
5107 
5108    struct drm_v3d_submit_csd *submit = &job->csd.submit;
5109 
5110    job->csd.wg_count[0] = wg_counts[0];
5111    job->csd.wg_count[1] = wg_counts[1];
5112    job->csd.wg_count[2] = wg_counts[2];
5113 
5114    submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5115    submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5116    submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5117 
5118    submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
5119                     (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
5120    assert(submit->cfg[4] != ~0);
5121 
5122    if (info->needs_wg_uniform_rewrite) {
5123       /* Make sure the GPU is not currently accessing the indirect CL for this
5124        * job, since we are about to overwrite some of the uniform data.
5125        */
5126       const uint64_t infinite = 0xffffffffffffffffull;
5127       v3dv_bo_wait(job->device, job->indirect.bo, infinite);
5128 
5129       for (uint32_t i = 0; i < 3; i++) {
5130          if (info->wg_uniform_offsets[i]) {
5131             /* Sanity check that our uniform pointers are within the allocated
5132              * BO space for our indirect CL.
5133              */
5134             assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
5135             assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
5136             *(info->wg_uniform_offsets[i]) = wg_counts[i];
5137          }
5138       }
5139    }
5140 }
5141 
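     /* Worked example of the CSD config packing done below (for illustration
      * only), assuming a compute shader with local size 8x8x1 dispatched as
      * vkCmdDispatch(cmd, 4, 4, 4):
      *
      *    wg_size   = 8 * 8 * 1 = 64
      *    cfg[0..2] = 4 << V3D_CSD_CFG012_WG_COUNT_SHIFT
      *    cfg[3]    = (1 << V3D_CSD_CFG3_WGS_PER_SG_SHIFT) |
      *                ((DIV_ROUND_UP(1 * 64, 16) - 1) <<
      *                 V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT) |
      *                (64 & 0xff)
      *              = 0x3140
      *    cfg[4]    = DIV_ROUND_UP(64, 16) * (4 * 4 * 4) - 1 = 255
      */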
5142 static struct v3dv_job *
5143 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
5144                           uint32_t group_count_x,
5145                           uint32_t group_count_y,
5146                           uint32_t group_count_z,
5147                           uint32_t **wg_uniform_offsets_out,
5148                           uint32_t *wg_size_out)
5149 {
5150    struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
5151    assert(pipeline && pipeline->cs && pipeline->cs->nir);
5152 
5153    struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
5154                                     sizeof(struct v3dv_job), 8,
5155                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
5156    if (!job) {
5157       v3dv_flag_oom(cmd_buffer, NULL);
5158       return NULL;
5159    }
5160 
5161    v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
5162    cmd_buffer->state.job = job;
5163 
5164    struct drm_v3d_submit_csd *submit = &job->csd.submit;
5165 
5166    job->csd.wg_count[0] = group_count_x;
5167    job->csd.wg_count[1] = group_count_y;
5168    job->csd.wg_count[2] = group_count_z;
5169 
5170    submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5171    submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5172    submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
5173 
5174    const struct nir_shader *cs =  pipeline->cs->nir;
5175 
   const uint32_t wgs_per_sg = 1; /* FIXME */
   const uint32_t wg_size = cs->info.cs.local_size[0] *
                            cs->info.cs.local_size[1] *
                            cs->info.cs.local_size[2];
   submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
   submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
                     V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
   submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
   if (wg_size_out)
      *wg_size_out = wg_size;

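   /* CFG4 is the total number of batches to run, minus one. For example, an
    * 8x8x1 local size gives wg_size = 64, i.e. 4 batches per workgroup, so a
    * 10x10x1 dispatch would program 4 * 100 - 1 = 399 here.
    */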
   uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
   submit->cfg[4] = batches_per_wg *
                    (group_count_x * group_count_y * group_count_z) - 1;
   assert(submit->cfg[4] != ~0);

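   /* CFG5 holds the address of the shader assembly plus execution flags:
    * NaN propagation, single-segment mode and 4-way threading.
    */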
   assert(pipeline->cs->current_variant &&
          pipeline->cs->current_variant->assembly_bo);
   const struct v3dv_shader_variant *variant = pipeline->cs->current_variant;
   submit->cfg[5] = variant->assembly_bo->offset;
   submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
   if (variant->prog_data.base->single_seg)
      submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
   if (variant->prog_data.base->threads == 4)
      submit->cfg[5] |= V3D_CSD_CFG5_THREADING;

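   /* If the shader uses shared variables we need to allocate a BO to back
    * them, sized for all the workgroups that can run in a supergroup.
    */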
   if (variant->prog_data.cs->shared_size > 0) {
      job->csd.shared_memory =
         v3dv_bo_alloc(cmd_buffer->device,
                       variant->prog_data.cs->shared_size * wgs_per_sg,
                       "shared_vars", true);
      if (!job->csd.shared_memory) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return job;
      }
   }

   v3dv_job_add_bo(job, variant->assembly_bo);

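   /* CFG6 points at the uniform stream for the shader.
    * v3dv_write_uniforms_wg_offsets also returns, through
    * wg_uniform_offsets_out, the locations of the workgroup-count uniforms so
    * that indirect dispatches can patch them at submit time.
    */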
   struct v3dv_cl_reloc uniforms =
      v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline->cs,
                                     wg_uniform_offsets_out);
   submit->cfg[6] = uniforms.bo->offset + uniforms.offset;

   v3dv_job_add_bo(job, uniforms.bo);

   return job;
}

static void
cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
                    uint32_t group_count_x,
                    uint32_t group_count_y,
                    uint32_t group_count_z)
{
   if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
      return;

   struct v3dv_job *job =
      cmd_buffer_create_csd_job(cmd_buffer,
                                group_count_x,
                                group_count_y,
                                group_count_z,
                                NULL, NULL);

   /* Job creation returns NULL on OOM, in which case the OOM flag has
    * already been set on the command buffer.
    */
   if (!job)
      return;

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;
}

void
v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
                 uint32_t groupCountX,
                 uint32_t groupCountY,
                 uint32_t groupCountZ)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer_emit_pre_dispatch(cmd_buffer);
   cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
}

static void
cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_buffer *buffer,
                             uint32_t offset)
{
   /* We can't do indirect dispatches, so instead we record a CPU job that,
    * when executed in the queue, will map the indirect buffer, read the
    * dispatch parameters, and submit a regular dispatch.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   /* We need to create a CSD job now, even if we still don't know the actual
    * dispatch parameters, because the job setup needs to be done using the
    * current command buffer state (i.e. pipeline, descriptor sets, push
    * constants, etc.). So we create the job with default dispatch parameters
    * and we will rewrite the parts we need at submit time if the indirect
    * parameters don't match the ones we used to set up the job.
    */
   struct v3dv_job *csd_job =
      cmd_buffer_create_csd_job(cmd_buffer,
                                1, 1, 1,
                                &job->cpu.csd_indirect.wg_uniform_offsets[0],
                                &job->cpu.csd_indirect.wg_size);
   v3dv_return_if_oom(cmd_buffer, NULL);
   assert(csd_job);

   job->cpu.csd_indirect.buffer = buffer;
   job->cpu.csd_indirect.offset = offset;
   job->cpu.csd_indirect.csd_job = csd_job;

   /* If the compute shader reads the workgroup sizes we will also need to
    * rewrite the corresponding uniforms.
    */
   job->cpu.csd_indirect.needs_wg_uniform_rewrite =
      job->cpu.csd_indirect.wg_uniform_offsets[0] ||
      job->cpu.csd_indirect.wg_uniform_offsets[1] ||
      job->cpu.csd_indirect.wg_uniform_offsets[2];

   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;
}

void
v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                         VkBuffer _buffer,
                         VkDeviceSize offset)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);

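   /* The indirect CPU job stores the buffer offset as a 32-bit value. */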
   assert(offset <= UINT32_MAX);

   cmd_buffer_emit_pre_dispatch(cmd_buffer);
   cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
}