1// Copyright 2021-2023 The Khronos Group Inc.
2//
3// SPDX-License-Identifier: CC-BY-4.0
4
5= VK_EXT_device_fault
6:toc: left
7:refpage: https://registry.khronos.org/vulkan/specs/1.2-extensions/man/html/
8:sectnums:
9
10This document outlines functionality to allow applications to query for
11additional diagnostic information following device-loss.
12
13== Problem Statement
14
15Device-loss errors can be challenging to diagnose. They can be triggered by a
16number of issues, including invalid application behaviour, driver bugs, and
17physical failure or removal of hardware. Whilst the Vulkan Validation layers are
18recommended as a first step in diagnosing the majority of API usage issues, they
19are unable to address all possible causes of device-loss.
20
21This proposal aims to provide application developers with additional information
22that may aid in diagnosing such errors.
23
24== Solution Space
25
26Several options have been considered:
27
28- Provide foundational extensions to enable the development of crash postmortem
29  tooling
30- Develop extensions or tools that aim to attribute faults to individual Vulkan
31  objects
32- Rely on individual vendor tools and extensions
33
34This proposal focuses on the first option. It represents a partial solution,
35with further extensions required in order to fully enable crash postmortem
36tooling.
37
38== Proposal
39
40=== API Features
41
42The following features are exposed by the `VK_EXT_device_fault` extension:
43
44[source,c]
45----
46typedef struct VkPhysicalDeviceFaultFeaturesEXT {
47    VkStructureType    sType;
48    void*              pNext;
49    VkBool32           deviceFault;
50    VkBool32           deviceFaultVendorBinary;
51} VkPhysicalDeviceFaultFeaturesEXT;
52----
53
54`deviceFault` is the main feature enabling this extension’s functionality and
55must be supported if this extension is supported.
56
57`deviceFaultVendorBinary` is an optional feature that enables support for
58vendor-specific binary crash dumps, which may be interpreted via external vendor
59tools.
60
61=== Querying for Fault Information
62
63Following device-loss, applications may query for additional diagnostic
64information by calling `vkGetDeviceFaultInfoEXT`.
65
66[source,c]
67----
68typedef struct VkDeviceFaultCountsEXT {
69    VkStructureType    sType;
70    void*              pNext;
71    uint32_t           addressInfoCount;
72    uint32_t           vendorInfoCount;
73    VkDeviceSize       vendorBinarySize;
74} VkDeviceFaultCountsEXT;
75
76typedef struct VkDeviceFaultInfoEXT {
77    VkStructureType                 sType;
78    void*                           pNext;
79    char                            description[VK_MAX_DESCRIPTION_SIZE];
80    VkDeviceFaultAddressInfoEXT*    pAddressInfos;
81    VkDeviceFaultVendorInfoEXT*     pVendorInfos;
82    void*                           pVendorBinaryData;
83} VkDeviceFaultInfoEXT;
84
85VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceFaultInfoEXT(
86    VkDevice                                    device,
87    VkDeviceFaultCountsEXT*                     pFaultCounts,
88    VkDeviceFaultInfoEXT*                       pFaultInfo);
89----
90
91The signature of `vkGetDeviceFaultInfoEXT` is intended to mirror the design of
92existing query functions, where the second parameter (`pFaultCounts`) indicates
93the size of output arrays, or the number of results written. However, device fault
94information requires multiple output arrays. Therefore, a
95`VkDeviceFaultCountsEXT` structure is used to specify the sizes of multiple
96arrays at once.
97
98[source,c]
99----
100// Query number of available results
101VkDeviceFaultCountsEXT faultCounts{};
102faultCounts.sType = VK_STRUCTURE_TYPE_DEVICE_FAULT_COUNTS_EXT;
103
104vkGetDeviceFaultInfoEXT(device, &faultCounts, NULL);
105
106// Allocate output arrays and query fault data
107VkDeviceFaultInfoEXT faultInfo{};
108faultInfo.sType             = VK_STRUCTURE_TYPE_DEVICE_FAULT_INFO_EXT;
109faultInfo.pAddressInfos     = (VkDeviceFaultAddressInfoEXT*) malloc(
110    sizeof(VkDeviceFaultAddressInfoEXT) * faultCounts.addressInfoCount);
111faultInfo.pVendorInfos      = (VkDeviceFaultVendorInfoEXT*) malloc(
112    sizeof(VkDeviceFaultVendorInfoEXT) * faultCounts.vendorInfoCount);
113faultInfo.pVendorBinaryData = malloc(faultCounts.vendorBinarySize);
114
115vkGetDeviceFaultInfoEXT(device, &faultCounts, &faultInfo);
116----
117
118=== Interpreting GPU Virtual Addresses
119
120Implementations may return information on both page faults generated by invalid
121memory accesses, and instruction pointers indicating the instructions executing
122at the time of the fault.
123
124[source,c]
125----
126typedef enum VkDeviceFaultAddressTypeEXT {
127    VK_DEVICE_FAULT_ADDRESS_TYPE_NONE_EXT = 0,
128    VK_DEVICE_FAULT_ADDRESS_TYPE_READ_INVALID_EXT = 1,
129    VK_DEVICE_FAULT_ADDRESS_TYPE_WRITE_INVALID_EXT = 2,
130    VK_DEVICE_FAULT_ADDRESS_TYPE_EXECUTE_INVALID_EXT = 3,
131    VK_DEVICE_FAULT_ADDRESS_TYPE_INSTRUCTION_POINTER_UNKNOWN_EXT = 4,
132    VK_DEVICE_FAULT_ADDRESS_TYPE_INSTRUCTION_POINTER_INVALID_EXT = 5,
133    VK_DEVICE_FAULT_ADDRESS_TYPE_INSTRUCTION_POINTER_FAULT_EXT = 6,
134    VK_DEVICE_FAULT_ADDRESS_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF
135} VkDeviceFaultAddressTypeEXT;
136
137typedef struct VkDeviceFaultAddressInfoEXT {
138    VkDeviceFaultAddressTypeEXT    addressType;
139    VkDeviceAddress                reportedAddress;
140    VkDeviceSize                   addressPrecision;
141} VkDeviceFaultAddressInfoEXT;
142----
143
144Page addresses and instruction pointers are reported as GPU virtual addresses,
145and additional extensions or vendor tools may be required in order to correlate
146these addresses with individual Vulkan objects.
147
148Implementations may only be able to report these addresses with limited
149precision. The combination of `reportedAddress` and `addressPrecision`
150allows the possible range of addresses to be calculated, such that:
151
152[source,c++]
153---------------------------------------------------
154lower_address = (pInfo->reportedAddress & ~(pInfo->addressPrecision-1))
155upper_address = (pInfo->reportedAddress |  (pInfo->addressPrecision-1))
156---------------------------------------------------
157
158[NOTE]
159.Note
160====
161It is valid for the `reportedAddress` to contain a more precise address
162than indicated by `addressPrecision`.
163In this case, the value of `reportedAddress` should be
164treated as an additional hint as to the value of the address that triggered the
165page fault, or to the value of an instruction pointer.
166====
167
168
169=== Vendor Binary Crash Dumps
170
171Optionally, implementations may also support the generation of vendor-specific
172binary blobs containing additional diagnostic information. All vendor-specific
173binaries will begin with a common header. The contents of the remainder of the
174binary blob are vendor-specific, and will require vendor-specific documentation
175or tools to interpret.
176
177[source,c]
178----
179typedef struct VkDeviceFaultVendorBinaryHeaderVersionOneEXT {
180    uint32_t                                     headerSize;
181    VkDeviceFaultVendorBinaryHeaderVersionEXT    headerVersion;
182    uint32_t                                     vendorID;
183    uint32_t                                     deviceID;
184    uint32_t                                     driverVersion;
185    uint8_t                                      pipelineCacheUUID[VK_UUID_SIZE];
186    uint32_t                                     applicationNameOffset;
187    uint32_t                                     applicationVersion;
188    uint32_t                                     engineNameOffset;
189} VkDeviceFaultVendorBinaryHeaderVersionOneEXT;
190----
191
192== Issues
193
1941) Should `vkGetDeviceFaultInfoEXT` return multiple faults?
195
196*RESOLVED*: No. This extension only seeks to identify a single fault as a
197possible cause of device loss and not to maintain a log of multiple faults.
198We anticipate that in cases where a GPU does encounter multiple faults, there
199is a high probability that the faults would be duplicates, such as those caused
200by parallel execution of the same defective code.
201
2022) Can `vkGetDeviceFaultInfoEXT` be called prior to device loss?
203
204*RESOLVED*: No. `VK_KHR_fault_handling` in Vulkan SC does support an equivalent
205to this, but `VK_KHR_fault_handling` aims to address a different use case, where
206a fault log is polled prior to device loss to enable remedial action to be taken.
207
2083) Do page faults need to report the actual address that was accessed, or
209should we allow reporting of the page address?
210
211*RESOLVED*: Some IHVs' hardware reports page faults at page alignment, or
212at some other hardware-unit dependent granularity, rather than the precise
213address that triggered the fault. All addresses are reported at hardware-unit
214dependent granularity, along with an associated precision indicator. This information
215can be used to compute an address range that contains the original address that
216triggered the fault.
217
2184) How should we report cases where one of multiple pipelines may have caused a
219fault?
220
221*RESOLVED*: In cases where a fault cannot be attributed to a single unique
222pipeline, reporting the set of possible candidates is desirable.
223
2245) The page fault and instruction address information structures have similar
225structure. Should they be combined?
226
227*RESOLVED*: Yes. These have been combined as `VkDeviceFaultAddressInfoEXT`
228to reduce API surface area.
229
2306) How should implementors approach extensibility for vendor-specific faults?
231Should they rely on pname:pNext chains, or should the extension introduce a
232generic structure to return vendor error codes and human-readable descriptions
233in the base structure?
234
235*RESOLVED*: Implementors should utilize the generic
236`VkDeviceFaultVendorInfoEXT` structures where applicable, and fall back to
237extending pname:pNext chains where this is insufficient. Where a pname:pNext
238chain is required, vendors should tailor their human-readable error
239descriptions to advise developers that additional information may be available.
240