1 /*
2  * Copyright 2015,2016 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef AMDKERNELCODET_H
26 #define AMDKERNELCODET_H
27 
28 //---------------------------------------------------------------------------//
29 // AMD Kernel Code, and its dependencies                                     //
30 //---------------------------------------------------------------------------//
31 
// Sets val bits for specified mask in specified dst packed instance.
//
// mask must be the bare name of a mask constant that has a matching
// <mask>_SHIFT constant: the shift name is formed by token pasting.
// The bits of dst selected by mask are cleared, then val is shifted into
// place and truncated to the mask; bits of dst outside the mask are kept.
//
// The expansion is a single statement (do { } while (0)), so the macro can be
// used as the body of an unbraced if/else or loop. dst must be a modifiable
// lvalue and is evaluated twice; val is evaluated once.
#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
  do {                                                                         \
    (dst) &= ~(mask);                                                          \
    (dst) |= (((val) << mask ## _SHIFT) & (mask));                             \
  } while (0)
36 
// Gets bits for specified mask from specified src packed instance.
//
// mask must be the bare name of a mask constant that has a matching
// <mask>_SHIFT constant: the shift name is formed by token pasting.
// src and mask are parenthesized so that an expression argument such as
// (a | b) is masked as a whole before being shifted down.
#define AMD_HSA_BITS_GET(src, mask)                                            \
  (((src) & (mask)) >> mask ## _SHIFT)
40 
/* Every amd_*_code_t has the following properties, which are composed of
 * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
 * bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount
 * (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
 *
 * Each mask is derived from its width and shift as
 * ((1 << WIDTH) - 1) << SHIFT, so the three constants of a field always
 * agree. Together the fields below cover all 32 bits of the property word.
 *
 * (Note that bit fields cannot be used as their layout is
 * implementation defined in the C standard and so cannot be used to
 * specify an ABI)
 */
enum amd_code_property_mask_t {

  /* Enable the setup of the SGPR user data registers
   * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
   * for initial register state.
   *
   * The total number of SGPR user data registers requested must not
   * exceed 16. Any requests beyond 16 will be ignored.
   *
   * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
   * SGPR user data registers enabled up to 16).
   */

  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,

  /* Bits 10..15 are not assigned to any property. Unused bits must be 0. */
  AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
  AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
  AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,

  /* Control wave ID base counter for GDS ordered-append. Used to set
   * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
   * ORDERED_APPEND_MODE also needs to be settable)
   */
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,

  /* The interleave (swizzle) element size in bytes required by the
   * code for private memory. This must be 2, 4, 8 or 16. This value
   * is provided to the finalizer when it is invoked and is recorded
   * here. The hardware will interleave the memory requests of each
   * lane of a wavefront by this element size to ensure each
   * work-item gets a distinct memory location. Therefore, the
   * finalizer ensures that all load and store operations done to
   * private memory do not exceed this size. For example, if the
   * element size is 4 (32-bits or dword) and a 64-bit value must be
   * loaded, the finalizer will generate two 32-bit loads. This
   * ensures that the interleaving will get the work-item
   * specific dword for both halves of the 64-bit value. If it just
   * did a 64-bit load then it would get one dword which belonged to
   * its own work-item, but the second dword would belong to the
   * adjacent lane work-item since the interleaving is in dwords.
   *
   * The value used must match the value that the runtime configures
   * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
   * is generally DWORD.
   *
   * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
   */
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,

  /* Are global memory addresses 64 bits. Must match
   * amd_kernel_code_t.hsail_machine_model ==
   * HSA_MACHINE_LARGE. Must also match
   * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
   * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
   */
  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,

  /* Indicate if the generated ISA is using a dynamically sized call
   * stack. This can happen if calls are implemented using a call
   * stack and recursion, alloca or calls to indirect functions are
   * present. In these cases the Finalizer cannot compute the total
   * private segment size at compile time. In this case the
   * workitem_private_segment_byte_size only specifies the statically
   * known private segment size, and additional space must be added
   * for the call stack.
   */
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,

  /* Indicate if code generated has support for debugging. */
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,

  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,

  /* Bits 23..31 are not assigned to any property. Unused bits must be 0.
   *
   * This field reaches bit 31, so its mask (bit pattern 0xFF800000) does
   * not fit in a positive int: computing it as a signed shift (511 << 23)
   * overflows, which is undefined behavior in C and is reported by
   * -Wshift-overflow. The mask is therefore built in unsigned arithmetic
   * and converted back to int, which keeps the same bit pattern on two's
   * complement implementations (the conversion is implementation-defined
   * in C, and modular on GCC and Clang).
   */
  AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
  AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
  AMD_CODE_PROPERTY_RESERVED2 = (int)(((1u << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT)
};
177 
178 /* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
179  * Code Object to set up the hardware to execute the kernel dispatch.
180  *
181  * Initial Kernel Register State.
182  *
183  * Initial kernel register state will be set up by CP/SPI prior to the start
184  * of execution of every wavefront. This is limited by the constraints of the
185  * current hardware.
186  *
187  * The order of the SGPR registers is defined, but the Finalizer can specify
188  * which ones are actually setup in the amd_kernel_code_t object using the
189  * enable_sgpr_* bit fields. The register numbers used for enabled registers
190  * are dense starting at SGPR0: the first enabled register is SGPR0, the next
191  * enabled register is SGPR1 etc.; disabled registers do not have an SGPR
192  * number.
193  *
 * The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and
195  * apply to all waves of the grid. It is possible to specify more than 16 User
196  * SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
197  * are actually initialized. These are then immediately followed by the System
198  * SGPRs that are set up by ADC/SPI and can have different values for each wave
199  * of the grid dispatch.
200  *
201  * SGPR register initial state is defined as follows:
202  *
203  * Private Segment Buffer (enable_sgpr_private_segment_buffer):
204  *   Number of User SGPR registers: 4. V# that can be used, together with
205  *   Scratch Wave Offset as an offset, to access the Private/Spill/Arg
206  *   segments using a segment address. It must be set as follows:
207  *     - Base address: of the scratch memory area used by the dispatch. It
208  *       does not include the scratch wave offset. It will be the per process
209  *       SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
210  *       example there may be a per pipe offset, or per AQL Queue offset).
211  *     - Stride + data_format: Element Size * Index Stride (???)
212  *     - Cache swizzle: ???
213  *     - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
214  *       scratch)
215  *     - Num records: Flat Scratch Work Item Size / Element Size (???)
216  *     - Dst_sel_*: ???
217  *     - Num_format: ???
218  *     - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
219  *       agree with amd_kernel_code_t.privateElementSize)
220  *     - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
221  *       be number of wavefront lanes for scratch, must agree with
222  *       amd_kernel_code_t.wavefrontSize)
223  *     - Add tid enable: 1
224  *     - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
225  *     - Hash_enable: ???
226  *     - Heap: ???
227  *     - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
228  *     - Type: 0 (a buffer) (???)
229  *
230  * Dispatch Ptr (enable_sgpr_dispatch_ptr):
231  *   Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
232  *   for kernel actually executing.
233  *
234  * Queue Ptr (enable_sgpr_queue_ptr):
235  *   Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
236  *   AQL queue on which the dispatch packet was queued.
237  *
238  * Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
239  *   Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
240  *   is directly copied from the kernargPtr in the dispatch packet. Having CP
241  *   load it once avoids loading it at the beginning of every wavefront.
242  *
243  * Dispatch Id (enable_sgpr_dispatch_id):
244  *   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
245  *   packet being executed.
246  *
247  * Flat Scratch Init (enable_sgpr_flat_scratch_init):
248  *   Number of User SGPR registers: 2. This is 2 SGPRs.
249  *
250  *   For CI/VI:
251  *     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
252  *     to base of memory for scratch for this dispatch. This is the same offset
253  *     used in computing the Scratch Segment Buffer base address. The value of
254  *     Scratch Wave Offset must be added by the kernel code and moved to
255  *     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
256  *
257  *     The second SGPR is 32 bit byte size of a single work-item's scratch
258  *     memory usage. This is directly loaded from the dispatch packet Private
259  *     Segment Byte Size and rounded up to a multiple of DWORD.
260  *
261  *     \todo [Does CP need to round this to >4 byte alignment?]
262  *
263  *     The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
264  *     flat memory instructions. Having CP load it once avoids loading it at
265  *     the beginning of every wavefront.
266  *
267  * Private Segment Size (enable_sgpr_private_segment_size):
268  *   Number of User SGPR registers: 1. The 32 bit byte size of a single
269  *   work-item's scratch memory allocation. This is the value from the dispatch
270  *   packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
271  *
272  *   \todo [Does CP need to round this to >4 byte alignment?]
273  *
274  *   Having CP load it once avoids loading it at the beginning of every
275  *   wavefront.
276  *
 *   \todo [This will not be used for CI/VI since it is the same value as
 *   the second SGPR of Flat Scratch Init.]
279  *
280  * Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
281  *   Number of User SGPR registers: 1. 32 bit count of the number of
282  *   work-groups in the X dimension for the grid being executed. Computed from
283  *   the fields in the HsaDispatchPacket as
284  *   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
285  *
286  * Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
287  *   Number of User SGPR registers: 1. 32 bit count of the number of
288  *   work-groups in the Y dimension for the grid being executed. Computed from
289  *   the fields in the HsaDispatchPacket as
290  *   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
291  *
292  *   Only initialized if <16 previous SGPRs initialized.
293  *
294  * Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
295  *   Number of User SGPR registers: 1. 32 bit count of the number of
296  *   work-groups in the Z dimension for the grid being executed. Computed
297  *   from the fields in the HsaDispatchPacket as
298  *   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
299  *
300  *   Only initialized if <16 previous SGPRs initialized.
301  *
302  * Work-Group Id X (enable_sgpr_workgroup_id_x):
303  *   Number of System SGPR registers: 1. 32 bit work group id in X dimension
304  *   of grid for wavefront. Always present.
305  *
306  * Work-Group Id Y (enable_sgpr_workgroup_id_y):
307  *   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
308  *   of grid for wavefront.
309  *
310  * Work-Group Id Z (enable_sgpr_workgroup_id_z):
311  *   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
312  *   of grid for wavefront. If present then Work-group Id Y will also be
313  *   present
314  *
315  * Work-Group Info (enable_sgpr_workgroup_info):
316  *   Number of System SGPR registers: 1. {first_wave, 14'b0000,
317  *   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
318  *
319  * Private Segment Wave Byte Offset
320  * (enable_sgpr_private_segment_wave_byte_offset):
321  *   Number of System SGPR registers: 1. 32 bit byte offset from base of
322  *   dispatch scratch base. Must be used as an offset with Private/Spill/Arg
323  *   segment address when using Scratch Segment Buffer. It must be added to
324  *   Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
325  *
326  *
327  * The order of the VGPR registers is defined, but the Finalizer can specify
328  * which ones are actually setup in the amd_kernel_code_t object using the
 * enable_vgpr_* bit fields. The register numbers used for enabled registers
 * are dense starting at VGPR0: the first enabled register is VGPR0, the next
 * enabled register is VGPR1 etc.; disabled registers do not have a VGPR
332  * number.
333  *
334  * VGPR register initial state is defined as follows:
335  *
336  * Work-Item Id X (always initialized):
337  *   Number of registers: 1. 32 bit work item id in X dimension of work-group
338  *   for wavefront lane.
339  *
 * Work-Item Id Y (enable_vgpr_workitem_id > 0):
341  *   Number of registers: 1. 32 bit work item id in Y dimension of work-group
342  *   for wavefront lane.
343  *
 * Work-Item Id Z (enable_vgpr_workitem_id > 1):
345  *   Number of registers: 1. 32 bit work item id in Z dimension of work-group
346  *   for wavefront lane.
347  *
348  *
349  * The setting of registers is being done by existing GPU hardware as follows:
350  *   1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
351  *      registers.
352  *   2) Work-group Id registers X, Y, Z are set by SPI which supports any
353  *      combination including none.
354  *   3) Scratch Wave Offset is also set by SPI which is why its value cannot
355  *      be added into the value Flat Scratch Offset which would avoid the
356  *      Finalizer generated prolog having to do the add.
357  *   4) The VGPRs are set by SPI which only supports specifying either (X),
358  *      (X, Y) or (X, Y, Z).
359  *
 * Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
 * they can be moved as a 64 bit value to the hardware required SGPRn-3 and
 * SGPRn-4 respectively using the Finalizer "FLAT_SCRATCH" Register.
363  *
364  * The global segment can be accessed either using flat operations or buffer
365  * operations. If buffer operations are used then the Global Buffer used to
 * access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
367  * segment address is not passed into the kernel code by CP since its base
368  * address is always 0. Instead the Finalizer generates prolog code to
369  * initialize 4 SGPRs with a V# that has the following properties, and then
370  * uses that in the buffer instructions:
371  *   - base address of 0
372  *   - no swizzle
373  *   - ATC=1
374  *   - MTYPE set to support memory coherence specified in
375  *     amd_kernel_code_t.globalMemoryCoherence
376  *
377  * When the Global Buffer is used to access the Kernarg segment, must add the
378  * dispatch packet kernArgPtr to a kernarg segment address before using this V#.
379  * Alternatively scalar loads can be used if the kernarg offset is uniform, as
380  * the kernarg segment is constant for the duration of the kernel execution.
381  */
382 
/* In-memory layout of the AMD Kernel Code Object header.
 *
 * Every member uses a fixed-width integer type and the member order is part
 * of the layout, so members must not be reordered, resized or removed.
 *
 * NOTE(review): with natural alignment the members appear to pack without
 * implicit padding into 256 bytes in total; confirm with a size check in a
 * consumer before relying on it.
 *
 * NOTE(review): the fixed-width types come from <stdint.h>, which this
 * header does not include itself; the including file has to provide it.
 */
typedef struct amd_kernel_code_s {
  /* Version of the kernel code object, as a (major, minor) pair.
   * NOTE(review): the compatibility rules between versions are not stated
   * in this header; confirm with the producer of the object.
   */
  uint32_t amd_kernel_code_version_major;
  uint32_t amd_kernel_code_version_minor;

  /* Machine kind and machine version (major, minor, stepping) the code was
   * generated for. NOTE(review): the valid values are not defined in this
   * header.
   */
  uint16_t amd_machine_kind;
  uint16_t amd_machine_version_major;
  uint16_t amd_machine_version_minor;
  uint16_t amd_machine_version_stepping;

  /* Byte offset (possibly negative) from start of amd_kernel_code_t
   * object to kernel's entry point instruction. The actual code for
   * the kernel is required to be 256 byte aligned to match hardware
   * requirements (SQ cache line is 16). The code must be position
   * independent code (PIC) for AMD devices to give runtime the
   * option of copying code to discrete GPU memory or APU L2
   * cache. The Finalizer should endeavour to allocate all kernel
   * machine code in contiguous memory pages so that a device
   * pre-fetcher will tend to only pre-fetch Kernel Code objects,
   * improving cache performance.
   */
  int64_t kernel_code_entry_byte_offset;

  /* Range of bytes to consider prefetching expressed as an offset
   * and size. The offset (possibly negative) is from the start of the
   * amd_kernel_code_t object. Set both to 0 if no prefetch
   * information is available.
   */
  int64_t kernel_code_prefetch_byte_offset;
  uint64_t kernel_code_prefetch_byte_size;

  /* Number of bytes of scratch backing memory required for full
   * occupancy of target chip. This takes into account the number of
   * bytes of scratch per work-item, the wavefront size, the maximum
   * number of wavefronts per CU, and the number of CUs. This is an
   * upper limit on scratch. If the grid being dispatched is small it
   * may need less than this. If the kernel uses no scratch, or
   * the Finalizer has not computed this value, it must be 0.
   */
  uint64_t max_scratch_backing_memory_byte_size;

  /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
   * COMPUTE_PGM_RSRC2 registers.
   */
  uint64_t compute_pgm_resource_registers;

  /* Code properties. See amd_code_property_mask_t for a full list of
   * properties. Individual properties can be read and written with
   * AMD_HSA_BITS_GET and AMD_HSA_BITS_SET using the
   * AMD_CODE_PROPERTY_* masks.
   */
  uint32_t code_properties;

  /* The amount of memory required for the combined private, spill
   * and arg segments for a work-item in bytes. If
   * is_dynamic_callstack is 1 then additional space must be added to
   * this value for the call stack.
   */
  uint32_t workitem_private_segment_byte_size;

  /* The amount of group segment memory required by a work-group in
   * bytes. This does not include any dynamically allocated group
   * segment memory that may be added when the kernel is
   * dispatched.
   */
  uint32_t workgroup_group_segment_byte_size;

  /* Number of bytes of GDS required by kernel dispatch. Must be 0 if
   * not using GDS.
   */
  uint32_t gds_segment_byte_size;

  /* The size in bytes of the kernarg segment that holds the values
   * of the arguments to the kernel. This could be used by CP to
   * prefetch the kernarg segment pointed to by the dispatch packet.
   */
  uint64_t kernarg_segment_byte_size;

  /* Number of fbarriers used in the kernel and all functions it
   * calls. If the implementation uses group memory to allocate the
   * fbarriers then that amount must already be included in the
   * workgroup_group_segment_byte_size total.
   */
  uint32_t workgroup_fbarrier_count;

  /* Number of scalar registers used by a wavefront. This includes
   * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
   * and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
   * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
   */
  uint16_t wavefront_sgpr_count;

  /* Number of vector registers used by each work-item. Used to set
   * COMPUTE_PGM_RSRC1.VGPRS.
   */
  uint16_t workitem_vgpr_count;

  /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
   * first fixed VGPR number reserved.
   */
  uint16_t reserved_vgpr_first;

  /* The number of consecutive VGPRs reserved by the client. If
   * is_debug_supported then this count includes VGPRs reserved
   * for debugger use.
   */
  uint16_t reserved_vgpr_count;

  /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
   * first fixed SGPR number reserved.
   */
  uint16_t reserved_sgpr_first;

  /* The number of consecutive SGPRs reserved by the client. If
   * is_debug_supported then this count includes SGPRs reserved
   * for debugger use.
   */
  uint16_t reserved_sgpr_count;

  /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
   * fixed SGPR number used to hold the wave scratch offset for the
   * entire kernel execution, or uint16_t(-1) if the register is not
   * used or not known.
   */
  uint16_t debug_wavefront_private_segment_offset_sgpr;

  /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
   * fixed SGPR number of the first of 4 SGPRs used to hold the
   * scratch V# used for the entire kernel execution, or uint16_t(-1)
   * if the registers are not used or not known.
   */
  uint16_t debug_private_segment_buffer_sgpr;

  /* The maximum byte alignment of variables used by the kernel in
   * the specified memory segment (kernarg, group and private
   * respectively). Expressed as a power of two. Must
   * be at least HSA_POWERTWO_16.
   */
  uint8_t kernarg_segment_alignment;
  uint8_t group_segment_alignment;
  uint8_t private_segment_alignment;

  /* Wavefront size expressed as a power of two. Must be a power of 2
   * in range 1..64 inclusive. Used to support runtime query that
   * obtains wavefront size, which may be used by application to
   * allocate dynamic group memory and set the dispatch work-group
   * size.
   */
  uint8_t wavefront_size;

  /* NOTE(review): the meaning and valid values of call_convention are not
   * described in this header; confirm with the producer.
   */
  int32_t call_convention;

  /* Reserved bytes. NOTE(review): presumably must be 0, in line with the
   * rule that unused bits must be 0; confirm.
   */
  uint8_t reserved3[12];

  /* NOTE(review): presumably filled in by the runtime loader to refer to
   * the kernel's symbol; not described in this header, confirm.
   */
  uint64_t runtime_loader_kernel_symbol;

  /* Sixteen 64-bit words of control directives. NOTE(review): their
   * encoding is not described in this header.
   */
  uint64_t control_directives[16];
} amd_kernel_code_t;
533 
534 #endif // AMDKERNELCODET_H
535