1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  * DEALINGS IN THE SOFTWARE.
26  */
27 
28 #ifndef TU_PRIVATE_H
29 #define TU_PRIVATE_H
30 
31 #include <assert.h>
32 #include <pthread.h>
33 #include <stdbool.h>
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef HAVE_VALGRIND
39 #include <memcheck.h>
40 #include <valgrind.h>
41 #define VG(x) x
42 #else
43 #define VG(x) ((void)0)
44 #endif
45 
46 #define MESA_LOG_TAG "TU"
47 
48 #include "c11/threads.h"
49 #include "main/macros.h"
50 #include "util/list.h"
51 #include "util/log.h"
52 #include "util/macros.h"
53 #include "util/u_atomic.h"
54 #include "vk_alloc.h"
55 #include "vk_object.h"
56 #include "vk_debug_report.h"
57 #include "wsi_common.h"
58 
59 #include "ir3/ir3_compiler.h"
60 #include "ir3/ir3_shader.h"
61 
62 #include "adreno_common.xml.h"
63 #include "adreno_pm4.xml.h"
64 #include "a6xx.xml.h"
65 #include "fdl/freedreno_layout.h"
66 #include "common/freedreno_dev_info.h"
67 
68 #include "tu_descriptor_set.h"
69 #include "tu_extensions.h"
70 #include "tu_util.h"
71 
72 /* Pre-declarations needed for WSI entrypoints */
73 struct wl_surface;
74 struct wl_display;
75 typedef struct xcb_connection_t xcb_connection_t;
76 typedef uint32_t xcb_visualid_t;
77 typedef uint32_t xcb_window_t;
78 
79 #include <vulkan/vk_android_native_buffer.h>
80 #include <vulkan/vk_icd.h>
81 #include <vulkan/vulkan.h>
82 #include <vulkan/vulkan_intel.h>
83 
84 #include "tu_entrypoints.h"
85 
86 #include "vk_format.h"
87 
/* Compile-time sizing limits used to dimension the arrays declared in this
 * header.
 */
#define MAX_VBS                     32
#define MAX_VERTEX_ATTRIBS          32
#define MAX_RTS                     8
#define MAX_VSC_PIPES               32
#define MAX_VIEWPORTS               16
#define MAX_SCISSORS                16
#define MAX_DISCARD_RECTANGLES      4
#define MAX_PUSH_CONSTANTS_SIZE     128
#define MAX_PUSH_DESCRIPTORS        32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
/* Total dynamic buffers: uniform plus storage. */
#define MAX_DYNAMIC_BUFFERS                                                  \
   (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS)
#define TU_MAX_DRM_DEVICES          8
#define MAX_VIEWS                   16
/* One bind point for compute and one for graphics. */
#define MAX_BIND_POINTS             2

/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE    0x20000000

/* Uniform buffer loads use ldc, just like the Qualcomm driver, so the same
 * maximum range is exposed.
 * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
 * range might be higher.
 */
#define MAX_UNIFORM_BUFFER_RANGE    0x10000

/* Descriptor sizes, in dwords. */
#define A6XX_TEX_CONST_DWORDS       16
#define A6XX_TEX_SAMP_DWORDS        4
115 
/* Iterate over the set bits of a 32-bit value, lowest bit first.
 *
 * `b` must be an integer lvalue declared by the caller; during each iteration
 * it holds the index of the bit being visited. `dword` is evaluated once and
 * copied into a loop-local variable. When the loop finishes, `b` has been
 * assigned __builtin_ffs(0) - 1, i.e. -1.
 *
 * The clearing mask is built from an unsigned 1: with a signed 1, clearing
 * bit 31 would shift into the sign bit of an int, which is undefined.
 */
#define for_each_bit(b, dword)                                               \
   for (uint32_t __dword = (dword);                                          \
        (b) = __builtin_ffs(__dword) - 1, __dword; __dword &= ~(1u << (b)))
119 
/* Evaluates to `val` when `cond` is non-zero, and to 0 otherwise. */
#define COND(cond, val) ((cond) ? (val) : 0)
/* Unsigned int mask with only bit number `n` set. */
#define BIT(n) (1u << (n))
122 
123 /* Whenever we generate an error, pass it through this function. Useful for
124  * debugging, where we can break on it. Only call at error site, not when
125  * propagating errors. Might be useful to plug in a stack trace here.
126  */
127 
128 struct tu_instance;
129 
130 VkResult
131 __vk_errorf(struct tu_instance *instance,
132             VkResult error,
133             bool force_print,
134             const char *file,
135             int line,
136             const char *format,
137             ...) PRINTFLIKE(6, 7);
138 
139 #define vk_error(instance, error)                                            \
140    __vk_errorf(instance, error, false, __FILE__, __LINE__, NULL);
141 #define vk_errorf(instance, error, format, ...)                              \
142    __vk_errorf(instance, error, false, __FILE__, __LINE__, format, ##__VA_ARGS__);
143 
/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
 * build.
 *
 * `instance` is expanded twice (once as the first argument and once to read
 * its debug_flags), so it must not have side effects. It is parenthesized
 * before `->` so that any pointer-valued expression may be passed.
 */
#define vk_startup_errorf(instance, error, format, ...) \
   __vk_errorf(instance, error, (instance)->debug_flags & TU_DEBUG_STARTUP, \
               __FILE__, __LINE__, format, ##__VA_ARGS__)
150 
151 void
152 __tu_finishme(const char *file, int line, const char *format, ...)
153    PRINTFLIKE(3, 4);
154 
155 /**
156  * Print a FINISHME message, including its source location.
157  */
158 #define tu_finishme(format, ...)                                             \
159    do {                                                                      \
160       static bool reported = false;                                          \
161       if (!reported) {                                                       \
162          __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__);           \
163          reported = true;                                                    \
164       }                                                                      \
165    } while (0)
166 
/* Report the enclosing function as a stub through tu_finishme(), which is
 * already a single do/while(0) statement.
 */
#define tu_stub() tu_finishme("stub %s", __func__)
171 
/* Entrypoint lookup by name. The implementations are not in this header.
 * NOTE(review): the "checked" variant presumably filters on the given core
 * version and enabled-extension tables — confirm against the definition.
 */
void *tu_lookup_entrypoint_unchecked(const char *name);

void *tu_lookup_entrypoint_checked(
   const char *name,
   uint32_t core_version,
   const struct tu_instance_extension_table *instance,
   const struct tu_device_extension_table *device);
180 
181 struct tu_physical_device
182 {
183    struct vk_object_base base;
184 
185    struct tu_instance *instance;
186 
187    char name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
188    uint8_t driver_uuid[VK_UUID_SIZE];
189    uint8_t device_uuid[VK_UUID_SIZE];
190    uint8_t cache_uuid[VK_UUID_SIZE];
191 
192    struct wsi_device wsi_device;
193 
194    int local_fd;
195    int master_fd;
196 
197    unsigned gpu_id;
198    uint32_t gmem_size;
199    uint64_t gmem_base;
200 
201    struct freedreno_dev_info info;
202 
203    int msm_major_version;
204    int msm_minor_version;
205 
206    bool limited_z24s8;
207 
208    /* This is the drivers on-disk cache used as a fallback as opposed to
209     * the pipeline cache defined by apps.
210     */
211    struct disk_cache *disk_cache;
212 
213    struct tu_device_extension_table supported_extensions;
214 };
215 
/* Bit flags stored in tu_instance::debug_flags; one bit per debug option. */
enum tu_debug_flags {
   TU_DEBUG_STARTUP    = 0x001,
   TU_DEBUG_NIR        = 0x002,
   TU_DEBUG_IR3        = 0x004,
   TU_DEBUG_NOBIN      = 0x008,
   TU_DEBUG_SYSMEM     = 0x010,
   TU_DEBUG_FORCEBIN   = 0x020,
   TU_DEBUG_NOUBWC     = 0x040,
   TU_DEBUG_NOMULTIPOS = 0x080,
   TU_DEBUG_NOLRZ      = 0x100,
};
228 
229 struct tu_instance
230 {
231    struct vk_object_base base;
232 
233    VkAllocationCallbacks alloc;
234 
235    uint32_t api_version;
236    int physical_device_count;
237    struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
238 
239    enum tu_debug_flags debug_flags;
240 
241    struct vk_debug_report_instance debug_report_callbacks;
242 
243    struct tu_instance_extension_table enabled_extensions;
244 };
245 
246 VkResult
247 tu_wsi_init(struct tu_physical_device *physical_device);
248 void
249 tu_wsi_finish(struct tu_physical_device *physical_device);
250 
251 bool
252 tu_instance_extension_supported(const char *name);
253 uint32_t
254 tu_physical_device_api_version(struct tu_physical_device *dev);
255 bool
256 tu_physical_device_extension_supported(struct tu_physical_device *dev,
257                                        const char *name);
258 
259 struct cache_entry;
260 
261 struct tu_pipeline_cache
262 {
263    struct vk_object_base base;
264 
265    struct tu_device *device;
266    pthread_mutex_t mutex;
267 
268    uint32_t total_size;
269    uint32_t table_size;
270    uint32_t kernel_count;
271    struct cache_entry **hash_table;
272    bool modified;
273 
274    VkAllocationCallbacks alloc;
275 };
276 
/* Currently has no members. */
struct tu_pipeline_key {
};
280 
281 
/* Queue types: only a single general queue type is defined. */
#define TU_QUEUE_GENERAL      0

/* Number of queue families; sizes the per-family arrays in tu_device. */
#define TU_MAX_QUEUE_FAMILIES 1
286 
287 struct tu_syncobj;
288 
289 struct tu_queue
290 {
291    struct vk_object_base base;
292 
293    struct tu_device *device;
294    uint32_t queue_family_index;
295    int queue_idx;
296    VkDeviceQueueCreateFlags flags;
297 
298    uint32_t msm_queue_id;
299    int fence;
300 };
301 
/* A buffer object: GEM handle, size, GPU address, and optional CPU mapping. */
struct tu_bo {
   uint32_t gem_handle;
   uint64_t size;
   uint64_t iova; /* GPU address */
   void *map;     /* CPU pointer; NOTE(review): presumably NULL until tu_bo_map() */
};
309 
310 enum global_shader {
311    GLOBAL_SH_VS,
312    GLOBAL_SH_FS_BLIT,
313    GLOBAL_SH_FS_CLEAR0,
314    GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
315    GLOBAL_SH_COUNT,
316 };
317 
/* Size of the custom border color bitset in tu_device. */
#define TU_BORDER_COLOR_COUNT   4096
/* Number of builtin border color entries at the head of the global bo. */
#define TU_BORDER_COLOR_BUILTIN 6
320 
321 /* This struct defines the layout of the global_bo */
322 struct tu6_global
323 {
324    /* clear/blit shaders, all <= 16 instrs (16 instr = 1 instrlen unit) */
325    instr_t shaders[GLOBAL_SH_COUNT][16];
326 
327    uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
328    uint32_t _pad0;
329    volatile uint32_t vsc_draw_overflow;
330    uint32_t _pad1;
331    volatile uint32_t vsc_prim_overflow;
332    uint32_t _pad2;
333    uint64_t predicate;
334 
335    /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
336    struct {
337       uint32_t offset;
338       uint32_t pad[7];
339    } flush_base[4];
340 
341    /* note: larger global bo will be used for customBorderColors */
342    struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
343 };
/* Byte offset of `field` within struct tu6_global. */
#define gb_offset(field) offsetof(struct tu6_global, field)
/* GPU address of `field` inside the global bo of `cmdbuf`'s device. */
#define global_iova(cmdbuf, field)                                           \
   ((cmdbuf)->device->global_bo.iova + gb_offset(field))
346 
/* Defined elsewhere. NOTE(review): presumably fills in global->shaders —
 * confirm against the definition.
 */
void
tu_init_clear_blit_shaders(struct tu6_global *global);
348 
/* Extra space reserved in the VSC draw/prim streams. */
#define VSC_PAD 0x40
351 
352 struct tu_device
353 {
354    struct vk_device vk;
355    struct tu_instance *instance;
356 
357    struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
358    int queue_count[TU_MAX_QUEUE_FAMILIES];
359 
360    struct tu_physical_device *physical_device;
361    int fd;
362    int _lost;
363 
364    struct ir3_compiler *compiler;
365 
366    /* Backup in-memory cache to be used if the app doesn't provide one */
367    struct tu_pipeline_cache *mem_cache;
368 
369 #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
370 
371    /* Currently the kernel driver uses a 32-bit GPU address space, but it
372     * should be impossible to go beyond 48 bits.
373     */
374    struct {
375       struct tu_bo bo;
376       mtx_t construct_mtx;
377       bool initialized;
378    } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
379 
380    struct tu_bo global_bo;
381 
382    struct tu_device_extension_table enabled_extensions;
383 
384    uint32_t vsc_draw_strm_pitch;
385    uint32_t vsc_prim_strm_pitch;
386    BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
387    mtx_t mutex;
388 
389    /* bo list for submits: */
390    struct drm_msm_gem_submit_bo *bo_list;
391    /* map bo handles to bo list index: */
392    uint32_t *bo_idx;
393    uint32_t bo_count, bo_list_size, bo_idx_size;
394    mtx_t bo_mutex;
395 };
396 
397 VkResult _tu_device_set_lost(struct tu_device *device,
398                              const char *msg, ...) PRINTFLIKE(2, 3);
399 #define tu_device_set_lost(dev, ...) \
400    _tu_device_set_lost(dev, __VA_ARGS__)
401 
402 static inline bool
tu_device_is_lost(struct tu_device * device)403 tu_device_is_lost(struct tu_device *device)
404 {
405    return unlikely(p_atomic_read(&device->_lost));
406 }
407 
408 VkResult
409 tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size, bool dump);
410 VkResult
411 tu_bo_init_dmabuf(struct tu_device *dev,
412                   struct tu_bo *bo,
413                   uint64_t size,
414                   int fd);
415 int
416 tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
417 void
418 tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
419 VkResult
420 tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
421 
422 /* Get a scratch bo for use inside a command buffer. This will always return
423  * the same bo given the same size or similar sizes, so only one scratch bo
424  * can be used at the same time. It's meant for short-lived things where we
425  * need to write to some piece of memory, read from it, and then immediately
426  * discard it.
427  */
428 VkResult
429 tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
430 
/* A span (offset and size) within a bo that the entry does not own. */
struct tu_cs_entry {
   /* No ownership */
   const struct tu_bo *bo;

   uint32_t size;
   uint32_t offset;
};
439 
/* A piece of memory described by both a CPU pointer and a GPU address. */
struct tu_cs_memory
{
   uint32_t *map; /* CPU pointer */
   uint64_t iova; /* GPU address */
};
444 
/* Address and size of a draw state, packed into bit-fields: a 48-bit GPU
 * address and a 16-bit size.
 */
struct tu_draw_state
{
   uint64_t iova : 48;
   uint32_t size : 16;
};
449 
450 enum tu_dynamic_state
451 {
452    /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
453    TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
454    TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
455    TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
456    TU_DYNAMIC_STATE_VB_STRIDE,
457    TU_DYNAMIC_STATE_COUNT,
458    /* no associated draw state: */
459    TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
460    /* re-use the line width enum as it uses GRAS_SU_CNTL: */
461    TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
462 };
463 
464 enum tu_draw_state_group_id
465 {
466    TU_DRAW_STATE_PROGRAM,
467    TU_DRAW_STATE_PROGRAM_BINNING,
468    TU_DRAW_STATE_TESS,
469    TU_DRAW_STATE_VB,
470    TU_DRAW_STATE_VI,
471    TU_DRAW_STATE_VI_BINNING,
472    TU_DRAW_STATE_RAST,
473    TU_DRAW_STATE_BLEND,
474    TU_DRAW_STATE_VS_CONST,
475    TU_DRAW_STATE_HS_CONST,
476    TU_DRAW_STATE_DS_CONST,
477    TU_DRAW_STATE_GS_CONST,
478    TU_DRAW_STATE_FS_CONST,
479    TU_DRAW_STATE_DESC_SETS,
480    TU_DRAW_STATE_DESC_SETS_LOAD,
481    TU_DRAW_STATE_VS_PARAMS,
482    TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
483    TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
484    TU_DRAW_STATE_LRZ,
485 
486    /* dynamic state related draw states */
487    TU_DRAW_STATE_DYNAMIC,
488    TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
489 };
490 
/* Operating mode of a struct tu_cs command stream. */
enum tu_cs_mode {
   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full.  tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after.
    *
    * This mode may create multiple entries internally.  The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW = 0,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer.  tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL = 1,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission.  tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to.  When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM = 2,
};
523 
524 struct tu_cs
525 {
526    uint32_t *start;
527    uint32_t *cur;
528    uint32_t *reserved_end;
529    uint32_t *end;
530 
531    struct tu_device *device;
532    enum tu_cs_mode mode;
533    uint32_t next_bo_size;
534 
535    struct tu_cs_entry *entries;
536    uint32_t entry_count;
537    uint32_t entry_capacity;
538 
539    struct tu_bo **bos;
540    uint32_t bo_count;
541    uint32_t bo_capacity;
542 
543    /* state for cond_exec_start/cond_exec_end */
544    uint32_t cond_flags;
545    uint32_t *cond_dwords;
546 };
547 
548 struct tu_device_memory
549 {
550    struct vk_object_base base;
551 
552    struct tu_bo bo;
553 };
554 
/* An address plus a size. */
struct tu_descriptor_range {
   uint64_t va;
   uint32_t size;
};
560 
561 struct tu_descriptor_set
562 {
563    struct vk_object_base base;
564 
565    const struct tu_descriptor_set_layout *layout;
566    struct tu_descriptor_pool *pool;
567    uint32_t size;
568 
569    uint64_t va;
570    uint32_t *mapped_ptr;
571 
572    uint32_t *dynamic_descriptors;
573 };
574 
/* One element of tu_descriptor_pool::entries: an offset/size pair and the
 * set it refers to.
 */
struct tu_descriptor_pool_entry {
   uint32_t offset;
   uint32_t size;
   struct tu_descriptor_set *set;
};
581 
582 struct tu_descriptor_pool
583 {
584    struct vk_object_base base;
585 
586    struct tu_bo bo;
587    uint64_t current_offset;
588    uint64_t size;
589 
590    uint8_t *host_memory_base;
591    uint8_t *host_memory_ptr;
592    uint8_t *host_memory_end;
593 
594    uint32_t entry_count;
595    uint32_t max_entry_count;
596    struct tu_descriptor_pool_entry entries[0];
597 };
598 
599 struct tu_descriptor_update_template_entry
600 {
601    VkDescriptorType descriptor_type;
602 
603    /* The number of descriptors to update */
604    uint32_t descriptor_count;
605 
606    /* Into mapped_ptr or dynamic_descriptors, in units of the respective array
607     */
608    uint32_t dst_offset;
609 
610    /* In dwords. Not valid/used for dynamic descriptors */
611    uint32_t dst_stride;
612 
613    uint32_t buffer_offset;
614 
615    /* Only valid for combined image samplers and samplers */
616    uint16_t has_sampler;
617 
618    /* In bytes */
619    size_t src_offset;
620    size_t src_stride;
621 
622    /* For push descriptors */
623    const struct tu_sampler *immutable_samplers;
624 };
625 
626 struct tu_descriptor_update_template
627 {
628    struct vk_object_base base;
629 
630    uint32_t entry_count;
631    VkPipelineBindPoint bind_point;
632    struct tu_descriptor_update_template_entry entry[0];
633 };
634 
635 struct tu_buffer
636 {
637    struct vk_object_base base;
638 
639    VkDeviceSize size;
640 
641    VkBufferUsageFlags usage;
642    VkBufferCreateFlags flags;
643 
644    struct tu_bo *bo;
645    VkDeviceSize bo_offset;
646 };
647 
648 static inline uint64_t
tu_buffer_iova(struct tu_buffer * buffer)649 tu_buffer_iova(struct tu_buffer *buffer)
650 {
651    return buffer->bo->iova + buffer->bo_offset;
652 }
653 
/* Name lookups for debug and perftest options by integer id (defined
 * elsewhere).
 */
const char *tu_get_debug_option_name(int id);
const char *tu_get_perftest_option_name(int id);
659 
660 struct tu_descriptor_state
661 {
662    struct tu_descriptor_set *sets[MAX_SETS];
663    struct tu_descriptor_set push_set;
664    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
665 };
666 
667 enum tu_cmd_dirty_bits
668 {
669    TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
670    TU_CMD_DIRTY_VB_STRIDE = BIT(1),
671    TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
672    TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
673    TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
674    TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
675    TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
676    TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
677    TU_CMD_DIRTY_LRZ = BIT(8),
678    /* all draw states were disabled and need to be re-enabled: */
679    TU_CMD_DIRTY_DRAW_STATE = BIT(9)
680 };
681 
/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */
enum tu_cmd_access_mask
{
   TU_ACCESS_UCHE_READ       = 1 << 0,
   TU_ACCESS_UCHE_WRITE      = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ  = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ  = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a transfer.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ  = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ  = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses by the host */
   TU_ACCESS_HOST_READ  = 1 << 10,
   TU_ACCESS_HOST_WRITE = 1 << 11,

   /* Accesses by a GPU engine which bypasses any cache. e.g. writes via
    * CP_EVENT_WRITE::BLIT and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ  = 1 << 12,
   TU_ACCESS_SYSMEM_WRITE = 1 << 13,

   /* Set if a WFI is required. This can be required for:
    * - 2D engine which (on some models) doesn't wait for flushes to complete
    *   before starting
    * - CP draw indirect opcodes, where we need to wait for any flushes to
    *   complete but the CP implicitly waits for WFI's to complete and
    *   therefore we only need a WFI after the flushes.
    */
   TU_ACCESS_WFI_READ = 1 << 14,

   /* Set if a CP_WAIT_FOR_ME is required due to the data being read by the CP
    * without it waiting for any WFI.
    */
   TU_ACCESS_WFM_READ = 1 << 15,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 16,

   /* Union of every read access above. */
   TU_ACCESS_READ = (TU_ACCESS_UCHE_READ |
                     TU_ACCESS_CCU_COLOR_READ |
                     TU_ACCESS_CCU_DEPTH_READ |
                     TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
                     TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
                     TU_ACCESS_HOST_READ |
                     TU_ACCESS_SYSMEM_READ |
                     TU_ACCESS_WFI_READ |
                     TU_ACCESS_WFM_READ),

   /* Union of every write access above. */
   TU_ACCESS_WRITE = (TU_ACCESS_UCHE_WRITE |
                      TU_ACCESS_CCU_COLOR_WRITE |
                      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
                      TU_ACCESS_CCU_DEPTH_WRITE |
                      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
                      TU_ACCESS_HOST_WRITE |
                      TU_ACCESS_SYSMEM_WRITE |
                      TU_ACCESS_CP_WRITE),

   TU_ACCESS_ALL = (TU_ACCESS_READ | TU_ACCESS_WRITE),
};
775 
/* Flush, invalidate and wait operations, one bit each, followed by the
 * composite masks built from them.
 */
enum tu_cmd_flush_bits
{
   TU_CMD_FLAG_CCU_FLUSH_DEPTH      = 1 << 0,
   TU_CMD_FLAG_CCU_FLUSH_COLOR      = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_FLUSH          = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE     = 1 << 5,
   TU_CMD_FLAG_WAIT_MEM_WRITES      = 1 << 6,
   TU_CMD_FLAG_WAIT_FOR_IDLE        = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_ME          = 1 << 8,

   /* Treat the CP as a sort of "cache" which may need to be "flushed" via
    * waiting for writes to land with WAIT_FOR_MEM_WRITES.
    */
   TU_CMD_FLAG_ALL_FLUSH = (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                            TU_CMD_FLAG_CCU_FLUSH_COLOR |
                            TU_CMD_FLAG_CACHE_FLUSH |
                            TU_CMD_FLAG_WAIT_MEM_WRITES),

   TU_CMD_FLAG_GPU_INVALIDATE = (TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
                                 TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
                                 TU_CMD_FLAG_CACHE_INVALIDATE),

   /* Treat the CP as a sort of "cache" which may need to be "invalidated"
    * via waiting for UCHE/CCU flushes to land with WFI/WFM.
    */
   TU_CMD_FLAG_ALL_INVALIDATE = (TU_CMD_FLAG_GPU_INVALIDATE |
                                 TU_CMD_FLAG_WAIT_FOR_IDLE |
                                 TU_CMD_FLAG_WAIT_FOR_ME),
};
809 
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. This enum keeps track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state
{
   TU_CMD_CCU_SYSMEM = 0,
   TU_CMD_CCU_GMEM = 1,
   TU_CMD_CCU_UNKNOWN = 2,
};
820 
821 struct tu_cache_state {
822    /* Caches which must be made available (flushed) eventually if there are
823     * any users outside that cache domain, and caches which must be
824     * invalidated eventually if there are any reads.
825     */
826    enum tu_cmd_flush_bits pending_flush_bits;
827    /* Pending flushes */
828    enum tu_cmd_flush_bits flush_bits;
829 };
830 
/* LRZ flags, each stored as a one-bit field. */
struct tu_lrz_pipeline {
   bool write : 1;
   bool invalidate : 1;

   bool enable : 1;
   bool greater : 1;
   bool z_test_enable : 1;
   bool blend_disable_write : 1;
};
841 
842 struct tu_lrz_state
843 {
844    /* Depth/Stencil image currently on use to do LRZ */
845    struct tu_image *image;
846    bool valid : 1;
847    struct tu_draw_state state;
848 };
849 
850 struct tu_cmd_state
851 {
852    uint32_t dirty;
853 
854    struct tu_pipeline *pipeline;
855    struct tu_pipeline *compute_pipeline;
856 
857    /* Vertex buffers, viewports, and scissors
858     * the states for these can be updated partially, so we need to save these
859     * to be able to emit a complete draw state
860     */
861    struct {
862       uint64_t base;
863       uint32_t size;
864       uint32_t stride;
865    } vb[MAX_VBS];
866    VkViewport viewport[MAX_VIEWPORTS];
867    VkRect2D scissor[MAX_SCISSORS];
868    uint32_t max_viewport, max_scissor;
869 
870    /* for dynamic states that can't be emitted directly */
871    uint32_t dynamic_stencil_mask;
872    uint32_t dynamic_stencil_wrmask;
873    uint32_t dynamic_stencil_ref;
874 
875    uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
876    enum pc_di_primtype primtype;
877 
878    /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
879    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
880    struct tu_draw_state vertex_buffers;
881    struct tu_draw_state shader_const[MESA_SHADER_STAGES];
882    struct tu_draw_state desc_sets;
883 
884    struct tu_draw_state vs_params;
885 
886    /* Index buffer */
887    uint64_t index_va;
888    uint32_t max_index_count;
889    uint8_t index_size;
890 
891    /* because streamout base has to be 32-byte aligned
892     * there is an extra offset to deal with when it is
893     * unaligned
894     */
895    uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
896 
897    /* Renderpasses are tricky, because we may need to flush differently if
898     * using sysmem vs. gmem and therefore we have to delay any flushing that
899     * happens before a renderpass. So we have to have two copies of the flush
900     * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
901     * and one for outside a renderpass.
902     */
903    struct tu_cache_state cache;
904    struct tu_cache_state renderpass_cache;
905 
906    enum tu_cmd_ccu_state ccu_state;
907 
908    const struct tu_render_pass *pass;
909    const struct tu_subpass *subpass;
910    const struct tu_framebuffer *framebuffer;
911    VkRect2D render_area;
912 
913    struct tu_cs_entry tile_store_ib;
914 
915    bool xfb_used;
916    bool has_tess;
917    bool has_subpass_predication;
918    bool predication_active;
919 
920    struct tu_lrz_state lrz;
921 };
922 
923 struct tu_cmd_pool
924 {
925    struct vk_object_base base;
926 
927    VkAllocationCallbacks alloc;
928    struct list_head cmd_buffers;
929    struct list_head free_cmd_buffers;
930    uint32_t queue_family_index;
931 };
932 
/* Status values for a command buffer (see tu_cmd_buffer::status). */
enum tu_cmd_buffer_status {
   TU_CMD_BUFFER_STATUS_INVALID = 0,
   TU_CMD_BUFFER_STATUS_INITIAL = 1,
   TU_CMD_BUFFER_STATUS_RECORDING = 2,
   TU_CMD_BUFFER_STATUS_EXECUTABLE = 3,
   TU_CMD_BUFFER_STATUS_PENDING = 4,
};
941 
942 struct tu_cmd_buffer
943 {
944    struct vk_object_base base;
945 
946    struct tu_device *device;
947 
948    struct tu_cmd_pool *pool;
949    struct list_head pool_link;
950 
951    VkCommandBufferUsageFlags usage_flags;
952    VkCommandBufferLevel level;
953    enum tu_cmd_buffer_status status;
954 
955    struct tu_cmd_state state;
956    uint32_t queue_family_index;
957 
958    uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
959    VkShaderStageFlags push_constant_stages;
960    struct tu_descriptor_set meta_push_descriptors;
961 
962    struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
963 
964    VkResult record_result;
965 
966    struct tu_cs cs;
967    struct tu_cs draw_cs;
968    struct tu_cs draw_epilogue_cs;
969    struct tu_cs sub_cs;
970 
971    uint32_t vsc_draw_strm_pitch;
972    uint32_t vsc_prim_strm_pitch;
973 };
974 
/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.h and tu_cs_emit_regs()
 */
struct tu_reg_value
{
   uint32_t reg;
   uint64_t value;

   /* Address-related fields. NOTE(review): presumably only meaningful when
    * is_address is set — confirm against tu_cs_emit_regs().
    */
   bool is_address;
   struct tu_bo *bo;
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
};
987 
988 
/* Cache flush and event emission helpers; implementations are elsewhere.
 * Each takes the command buffer and the command stream to emit into.
 */
void
tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                               struct tu_cs *cs);

void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state);

void tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          enum vgt_event_type event);
1000 
1001 static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)1002 tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
1003                          VkPipelineBindPoint bind_point)
1004 {
1005    return &cmd_buffer->descriptors[bind_point];
1006 }
1007 
1008 struct tu_event
1009 {
1010    struct vk_object_base base;
1011    struct tu_bo bo;
1012 };
1013 
/* Driver object that VkShaderModule handles are cast to.
 *
 * code[] is a flexible array member, so the object must be allocated with
 * room for the code following the header.
 */
struct tu_shader_module
{
   struct vk_object_base base;

   uint32_t code_size; /* NOTE(review): unit (bytes vs. dwords) not visible here */
   uint32_t code[];
};
1021 
/* A push-constant range described by a start (lo) and a count.
 * NOTE(review): the unit of lo/count is not visible in this header --
 * confirm against the code that fills it in.
 */
struct tu_push_constant_range
{
   uint32_t lo;
   uint32_t count;
};
1027 
/* Per-shader driver state wrapping an ir3 shader.  Created by
 * tu_shader_create() and released with tu_shader_destroy(), both declared
 * below.
 */
struct tu_shader
{
   struct ir3_shader *ir3_shader;

   struct tu_push_constant_range push_consts;
   /* NOTE(review): 8 bits here, while tu_pipeline::active_desc_sets is
    * 32 bits wide; presumably a bitmask of descriptor set indices.
    */
   uint8_t active_desc_sets;
   bool multi_pos_output;
};
1036 
/* Shader compilation entry points; only declared in this part of the header.
 * Return-value and ownership conventions are not visible here.
 */

/* Multiview lowering on a NIR shader for the given view mask.
 * multi_pos_output is an out-parameter (non-const bool pointer).
 */
bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
                       struct tu_device *dev);

/* Produce a nir_shader for one pipeline stage from its Vulkan stage info. */
nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
                const VkPipelineShaderStageCreateInfo *stage_info,
                gl_shader_stage stage);

/* Create a tu_shader from NIR; alloc supplies the Vulkan allocation
 * callbacks.
 */
struct tu_shader *
tu_shader_create(struct tu_device *dev,
                 nir_shader *nir,
                 unsigned multiview_mask,
                 struct tu_pipeline_layout *layout,
                 const VkAllocationCallbacks *alloc);

/* Counterpart of tu_shader_create(). */
void
tu_shader_destroy(struct tu_device *dev,
                  struct tu_shader *shader,
                  const VkAllocationCallbacks *alloc);
1057 
/* Per-stage linkage data stored in tu_pipeline::program.link[], one entry
 * per shader stage.  The ir3 const state is copied by value.
 */
struct tu_program_descriptor_linkage
{
   struct ir3_const_state const_state;

   uint32_t constlen;

   struct tu_push_constant_range push_consts;
};
1066 
/* Driver object that VkPipeline handles are cast to.
 *
 * Holds the pipeline's own struct tu_cs, its layout, and a number of
 * struct tu_draw_state entries grouped by purpose (dynamic state, program,
 * vertex input, ...).  The groups below are anonymous structs used purely
 * for namespacing.
 */
struct tu_pipeline
{
   struct vk_object_base base;

   struct tu_cs cs;

   struct tu_pipeline_layout *layout;

   bool need_indirect_descriptor_sets;
   VkShaderStageFlags active_stages;
   uint32_t active_desc_sets;

   /* mask of enabled dynamic states
    * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
    */
   uint32_t dynamic_state_mask;
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];

   /* for dynamic states which use the same register: */
   /* each register value is paired with a mask of the same name */
   uint32_t gras_su_cntl, gras_su_cntl_mask;
   uint32_t rb_depth_cntl, rb_depth_cntl_mask;
   uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;

   bool rb_depth_cntl_disable;

   /* draw states for the pipeline */
   struct tu_draw_state load_state, rast_state, blend_state;

   /* for vertex buffers state */
   uint32_t num_vbs;

   /* shader program state, plus per-stage linkage indexed by shader stage */
   struct
   {
      struct tu_draw_state state;
      struct tu_draw_state binning_state;

      struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
   } program;

   /* "vi" group: a regular and a binning draw state */
   struct
   {
      struct tu_draw_state state;
      struct tu_draw_state binning_state;
   } vi;

   /* "ia" group: primitive type and primitive-restart flag */
   struct
   {
      enum pc_di_primtype primtype;
      bool primitive_restart;
   } ia;

   /* tessellation parameters */
   struct
   {
      uint32_t patch_type;
      uint32_t param_stride;
      uint32_t hs_bo_regid;
      uint32_t ds_bo_regid;
      bool upper_left_domain_origin;
   } tess;

   /* compute-only: three-component local size */
   struct
   {
      uint32_t local_size[3];
   } compute;

   struct tu_lrz_pipeline lrz;
};
1134 
/* tu6_* state helpers.  Only the declarations are in this part of the
 * header.  Every function below takes a struct tu_cs *cs (presumably the
 * command stream the state is written to -- confirm against the
 * definitions).
 */

/* viewport points at num_viewport VkViewport entries. */
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport);

/* scs points at scissor_count VkRect2D entries. */
void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);

/* Unlike its neighbours this one also takes the command buffer. */
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);

void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);

void
tu6_emit_depth_bias(struct tu_cs *cs,
                    float constant_factor,
                    float clamp,
                    float slope_factor);

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples);

/* NOTE(review): whether (x2, y2) is inclusive is not visible here. */
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

/* Per-stage shader config for variant xs; binary_iova is a 64-bit value
 * (by its name, the GPU address of the shader binary).
 */
void
tu6_emit_xs_config(struct tu_cs *cs,
                   gl_shader_stage stage,
                   const struct ir3_shader_variant *xs,
                   uint64_t binary_iova);

/* Takes one shader variant pointer per stage (vs/hs/ds/gs/fs).
 * NOTE(review): which of them may be NULL is not visible here.
 */
void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points,
             bool vshs_workgroup);

void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
1177 
/* Forward declaration: the prototypes below only need pointers to it. */
struct tu_image_view;

/* Attachment resolve / clear / load / store helpers.  Only declared in this
 * part of the header.  In the functions taking `uint32_t a`, `a` is
 * presumably an attachment index -- confirm against the definitions.
 */
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  struct tu_image_view *src,
                  struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect);

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info);

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info);

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load);

/* expose this function to be able to emit load without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);

/* note: gmem store can also resolve */
/* Takes two indices, a and gmem_a; their exact roles are not visible here. */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a);
1217 
/* Format capability flags.  The values are distinct single bits so they can
 * be OR-ed together; stored in tu_native_format::supported.
 */
enum tu_supported_formats {
   FMT_VERTEX = 1,
   FMT_TEXTURE = 2,
   FMT_COLOR = 4,
};
1223 
/* Hardware format description returned by the tu6_format_*() lookups.
 * Each field is an 8-bit bit-field of an enum type, four fields in total.
 */
struct tu_native_format
{
   enum a6xx_format fmt : 8;
   enum a3xx_color_swap swap : 8;
   enum a6xx_tile_mode tile_mode : 8;
   enum tu_supported_formats supported : 8;
};
1231 
/* VkFormat -> tu_native_format lookups, returned by value.  The color and
 * texture variants additionally take the tile mode; the vertex variant does
 * not.
 */
struct tu_native_format tu6_format_vtx(VkFormat format);
struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode);
struct tu_native_format tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode);
1235 
1236 static inline enum a6xx_format
tu6_base_format(VkFormat format)1237 tu6_base_format(VkFormat format)
1238 {
1239    /* note: tu6_format_color doesn't care about tiling for .fmt field */
1240    return tu6_format_color(format, TILE6_LINEAR).fmt;
1241 }
1242 
/* Driver object that VkImage handles are cast to. */
struct tu_image
{
   struct vk_object_base base;

   /* The original VkFormat provided by the client.  This may not match any
    * of the actual surface formats.
    */
   VkFormat vk_format;
   uint32_t level_count;
   uint32_t layer_count;

   /* Up to three layouts; NOTE(review): presumably one per plane -- confirm. */
   struct fdl_layout layout[3];
   uint32_t total_size;

#ifdef ANDROID
   /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
   VkDeviceMemory owned_memory;
#endif

   /* Set when bound */
   struct tu_bo *bo;
   VkDeviceSize bo_offset;

   /* LRZ buffer geometry; see tu6_clear_lrz() for a user of the image. */
   uint32_t lrz_height;
   uint32_t lrz_pitch;
   uint32_t lrz_offset;
};
1270 
1271 static inline uint32_t
tu_get_layerCount(const struct tu_image * image,const VkImageSubresourceRange * range)1272 tu_get_layerCount(const struct tu_image *image,
1273                   const VkImageSubresourceRange *range)
1274 {
1275    return range->layerCount == VK_REMAINING_ARRAY_LAYERS
1276              ? image->layer_count - range->baseArrayLayer
1277              : range->layerCount;
1278 }
1279 
1280 static inline uint32_t
tu_get_levelCount(const struct tu_image * image,const VkImageSubresourceRange * range)1281 tu_get_levelCount(const struct tu_image *image,
1282                   const VkImageSubresourceRange *range)
1283 {
1284    return range->levelCount == VK_REMAINING_MIP_LEVELS
1285              ? image->level_count - range->baseMipLevel
1286              : range->levelCount;
1287 }
1288 
/* Driver object that VkImageView handles are cast to.
 *
 * Besides the addresses and sizes of the viewed image data it caches two
 * texture descriptors and a set of pre-filled register values.  It is
 * initialised by tu_image_view_init(), declared further down.
 */
struct tu_image_view
{
   struct vk_object_base base;

   struct tu_image *image; /**< VkImageViewCreateInfo::image */

   uint64_t base_addr;
   uint64_t ubwc_addr;
   uint32_t layer_size;
   uint32_t ubwc_layer_size;

   /* used to determine if fast gmem store path can be used */
   VkExtent2D extent;
   bool need_y2_align;

   bool ubwc_enabled;

   /* Descriptor for use as a sampled image (see storage_descriptor below). */
   uint32_t descriptor[A6XX_TEX_CONST_DWORDS];

   /* Descriptor for use as a storage image as opposed to a sampled image.
    * This has a few differences for cube maps (e.g. type).
    */
   uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS];

   /* pre-filled register values */
   /* The upper-case names double as the `x` argument of
    * tu_image_view_stencil(), which pastes them into A6XX_<x>_COLOR_FORMAT.
    */
   uint32_t PITCH;
   uint32_t FLAG_BUFFER_PITCH;

   uint32_t RB_MRT_BUF_INFO;
   uint32_t SP_FS_MRT_REG;

   uint32_t SP_PS_2D_SRC_INFO;
   uint32_t SP_PS_2D_SRC_SIZE;

   uint32_t RB_2D_DST_INFO;

   uint32_t RB_BLIT_DST_INFO;

   /* for d32s8 separate stencil */
   uint64_t stencil_base_addr;
   uint32_t stencil_layer_size;
   uint32_t stencil_PITCH;
};
1332 
/* Driver object that VkSamplerYcbcrConversion handles are cast to.
 * The fields use the Vulkan types directly; presumably copied from the
 * conversion's create info -- confirm against the creation code.
 */
struct tu_sampler_ycbcr_conversion {
   struct vk_object_base base;

   VkFormat format;
   VkSamplerYcbcrModelConversion ycbcr_model;
   VkSamplerYcbcrRange ycbcr_range;
   VkComponentMapping components;
   VkChromaLocation chroma_offsets[2]; /* two entries; presumably x and y */
   VkFilter chroma_filter;
};
1343 
/* Driver object that VkSampler handles are cast to: the sampler descriptor
 * dwords plus a pointer to a Y'CbCr conversion object.
 */
struct tu_sampler {
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   /* NOTE(review): presumably NULL when no conversion is attached -- confirm. */
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
1350 
/* Helpers that take a struct tu_cs, an image view and a layer index.  Only
 * declared in this part of the header; what exactly each one writes is not
 * visible here.
 */
void
tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

/* 2D variant; `src` presumably selects source vs. destination -- confirm. */
void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src);

void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);

void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1362 
/* Yield the pre-filled register value `x` of image view `iview` with its
 * COLOR_FORMAT field replaced by FMT6_8_UINT.
 *
 * iview: pointer expression to a struct with a member named `x`
 * x:     register name, e.g. RB_MRT_BUF_INFO; it is token-pasted into
 *        A6XX_<x>_COLOR_FORMAT__MASK and A6XX_<x>_COLOR_FORMAT().
 *
 * `iview` is parenthesized so that arbitrary pointer expressions (p + 1,
 * &view, cond ? a : b) expand correctly.
 */
#define tu_image_view_stencil(iview, x) \
   (((iview)->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
1365 
/* Gralloc (Android native buffer) helpers and image view setup.  Only
 * declared in this part of the header.
 */

/* dma_buf and modifier are out-parameters (non-const pointers); the result
 * is reported as a VkResult.
 */
VkResult
tu_gralloc_info(struct tu_device *device,
                const VkNativeBufferANDROID *gralloc_info,
                int *dma_buf,
                uint64_t *modifier);

/* Takes Vulkan handles (device_h, image_h) rather than driver structs. */
VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
                                     int dma_buf,
                                     const VkAllocationCallbacks *alloc,
                                     VkImage image_h);

/* Fill in a caller-provided tu_image_view from its create info. */
void
tu_image_view_init(struct tu_image_view *iview,
                   const VkImageViewCreateInfo *pCreateInfo,
                   bool limited_z24s8);

/* Predicate on format/type/usage.  Note that it lacks the tu_ prefix used
 * by the other non-static functions in this header.
 */
bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, bool limited_z24s8);
1385 
/* Driver object that VkBufferView handles are cast to: a texture descriptor
 * plus a pointer to the viewed buffer.
 */
struct tu_buffer_view
{
   struct vk_object_base base;

   uint32_t descriptor[A6XX_TEX_CONST_DWORDS];

   struct tu_buffer *buffer;
};
/* Fill in a caller-provided tu_buffer_view from its create info.  Only
 * declared here.
 */
void
tu_buffer_view_init(struct tu_buffer_view *view,
                    struct tu_device *device,
                    const VkBufferViewCreateInfo *pCreateInfo);
1398 
/* One framebuffer attachment slot: a pointer to the attached image view.
 * Used as the element type of tu_framebuffer::attachments.
 */
struct tu_attachment_info
{
   struct tu_image_view *attachment;
};
1403 
1404 struct tu_framebuffer
1405 {
1406    struct vk_object_base base;
1407 
1408    uint32_t width;
1409    uint32_t height;
1410    uint32_t layers;
1411 
1412    /* size of the first tile */
1413    VkExtent2D tile0;
1414    /* number of tiles */
1415    VkExtent2D tile_count;
1416 
1417    /* size of the first VSC pipe */
1418    VkExtent2D pipe0;
1419    /* number of VSC pipes */
1420    VkExtent2D pipe_count;
1421 
1422    /* pipe register values */
1423    uint32_t pipe_config[MAX_VSC_PIPES];
1424    uint32_t pipe_sizes[MAX_VSC_PIPES];
1425 
1426    uint32_t attachment_count;
1427    struct tu_attachment_info attachments[0];
1428 };
1429 
/* Compute the tiling configuration of a framebuffer for a render pass.
 * fb is the only non-const parameter, so results are presumably stored in
 * it (tile0, tile_count, pipe0, ... -- confirm against the definition).
 */
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass);
1434 
/* Barrier description attached to a subpass (tu_subpass::start_barrier) and
 * to the end of a render pass (tu_render_pass::end_barrier).
 */
struct tu_subpass_barrier {
   VkPipelineStageFlags src_stage_mask;
   VkAccessFlags src_access_mask;
   VkAccessFlags dst_access_mask;
   bool incoherent_ccu_color, incoherent_ccu_depth;
};
1441 
/* Reference from a subpass to an attachment.
 * NOTE(review): `attachment` is presumably an index into
 * tu_render_pass::attachments -- confirm.
 */
struct tu_subpass_attachment
{
   uint32_t attachment;
};
1446 
/* One subpass of a render pass; stored in tu_render_pass::subpasses.
 *
 * The attachment pointers are not owned arrays declared here;
 * NOTE(review): they presumably point into
 * tu_render_pass::subpass_attachments -- confirm.
 */
struct tu_subpass
{
   uint32_t input_count;
   uint32_t color_count;
   struct tu_subpass_attachment *input_attachments;
   struct tu_subpass_attachment *color_attachments;
   /* No separate count field; presumably color_count entries when set. */
   struct tu_subpass_attachment *resolve_attachments;
   struct tu_subpass_attachment depth_stencil_attachment;

   VkSampleCountFlagBits samples;

   uint32_t srgb_cntl;
   uint32_t multiview_mask;

   struct tu_subpass_barrier start_barrier;
};
1463 
/* Per-attachment state of a render pass; stored in
 * tu_render_pass::attachments.
 */
struct tu_render_pass_attachment
{
   VkFormat format;
   uint32_t samples;
   uint32_t cpp;
   VkImageAspectFlags clear_mask;
   uint32_t clear_views;
   bool load;
   bool store;
   /* Signed; NOTE(review): a negative value presumably means "not placed
    * in GMEM" -- confirm.
    */
   int32_t gmem_offset;
   /* for D32S8 separate stencil: */
   bool load_stencil;
   bool store_stencil;
   int32_t gmem_offset_stencil;
};
1479 
1480 struct tu_render_pass
1481 {
1482    struct vk_object_base base;
1483 
1484    uint32_t attachment_count;
1485    uint32_t subpass_count;
1486    uint32_t gmem_pixels;
1487    uint32_t tile_align_w;
1488    struct tu_subpass_attachment *subpass_attachments;
1489    struct tu_render_pass_attachment *attachments;
1490    struct tu_subpass_barrier end_barrier;
1491    struct tu_subpass subpasses[0];
1492 };
1493 
/* Driver object that VkQueryPool handles are cast to.  The backing buffer
 * object is embedded by value.
 */
struct tu_query_pool
{
   struct vk_object_base base;

   VkQueryType type;
   uint32_t stride; /* NOTE(review): presumably bytes per query slot -- confirm */
   uint64_t size;
   uint32_t pipeline_statistics;
   struct tu_bo bo;
};
1504 
/* Descriptor update, device enumeration, DRM submit queue and sync object
 * entry points.  Only declared in this part of the header.
 */

/* Mirrors the vkUpdateDescriptorSets() argument list with an extra leading
 * overrideSet handle.
 * NOTE(review): the semantics of overrideSet are not visible here.
 */
void
tu_update_descriptor_sets(VkDescriptorSet overrideSet,
                          uint32_t descriptorWriteCount,
                          const VkWriteDescriptorSet *pDescriptorWrites,
                          uint32_t descriptorCopyCount,
                          const VkCopyDescriptorSet *pDescriptorCopies);

/* Template-based update of one set; pData is opaque to this signature. */
void
tu_update_descriptor_set_with_template(
   struct tu_descriptor_set *set,
   VkDescriptorUpdateTemplate descriptorUpdateTemplate,
   const void *pData);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);

/* queue_id is an out-parameter.  The result is an int, not a VkResult;
 * NOTE(review): presumably 0 / negative errno -- confirm.
 */
int
tu_drm_submitqueue_new(const struct tu_device *dev,
                       int priority,
                       uint32_t *queue_id);

/* Counterpart of tu_drm_submitqueue_new(). */
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);

/* NOTE(review): whether either fence may be NULL is not visible here. */
int
tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_syncobj *fence2);

/* Returns an int; by its name a file descriptor for the sync object. */
int
tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync);
1537 
/* Define <type>_from_handle() and <type>_to_handle() for a handle type that
 * can be cast directly to and from a struct pointer.  Used below for the
 * dispatchable handles (VkDevice, VkQueue, ...).
 *
 * __tu_type: struct tag without the `struct` keyword, e.g. tu_device
 * __VkType:  the matching Vulkan handle type
 */
#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType)                          \
                                                                             \
   static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
   {                                                                         \
      return (struct __tu_type *) _handle;                                   \
   }                                                                         \
                                                                             \
   static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj)      \
   {                                                                         \
      return (__VkType) _obj;                                                \
   }

/* Same as TU_DEFINE_HANDLE_CASTS, but the conversion goes through uintptr_t,
 * so it also works when the handle type is an integer rather than a pointer.
 * Used below for the non-dispatchable handles.
 */
#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType)                  \
                                                                             \
   static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \
   {                                                                         \
      return (struct __tu_type *) (uintptr_t) _handle;                       \
   }                                                                         \
                                                                             \
   static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj)      \
   {                                                                         \
      return (__VkType)(uintptr_t) _obj;                                     \
   }

/* Declare a local `struct __tu_type *__name` initialised from __handle via
 * the matching <type>_from_handle().  Expands to a declaration without a
 * trailing semicolon; the caller supplies it.
 */
#define TU_FROM_HANDLE(__tu_type, __name, __handle)                          \
   struct __tu_type *__name = __tu_type##_from_handle(__handle)
1564 
/* Cast helpers for handle types converted by a plain pointer cast.  Each
 * line expands to two complete function definitions, so no trailing
 * semicolon is used.
 */
TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer)
TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice)
TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance)
TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice)
TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue)
1570 
1571 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool)
1572 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer)
1573 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView)
1574 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool)
1575 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet)
1576 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout,
1577                                VkDescriptorSetLayout)
1578 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template,
1579                                VkDescriptorUpdateTemplate)
1580 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory)
1581 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent)
1582 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer)
1583 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage)
1584 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView);
1585 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache)
1586 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline)
1587 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout)
1588 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool)
1589 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass)
1590 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler)
1591 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, VkSamplerYcbcrConversion)
1592 TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule)
1593 
/* for TU_FROM_HANDLE with both VkFence and VkSemaphore:
 *
 * Written as a macro rather than generated by
 * TU_DEFINE_NONDISP_HANDLE_CASTS, so the argument is not tied to a single
 * handle type.  The value is converted through uintptr_t to
 * struct tu_syncobj *.
 */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
1596 
1597 #endif /* TU_PRIVATE_H */
1598