1 //===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for hsa machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <dlfcn.h>
19 #include <elf.h>
20 #include <ffi.h>
21 #include <fstream>
22 #include <iostream>
23 #include <libelf.h>
24 #include <list>
25 #include <memory>
26 #include <mutex>
27 #include <shared_mutex>
28 #include <thread>
29 #include <unordered_map>
30 #include <vector>
31 
32 // Header from ATMI interface
33 #include "atmi_interop_hsa.h"
34 #include "atmi_runtime.h"
35 
36 #include "internal.h"
37 
38 #include "Debug.h"
39 #include "omptargetplugin.h"
40 
41 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
42 
43 #ifndef TARGET_NAME
44 #define TARGET_NAME AMDHSA
45 #endif
46 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
47 
48 // hostrpc interface, FIXME: consider moving to its own include. These are
49 // statically linked into amdgpu/plugin if present from hostrpc_services.a,
50 // linked as --whole-archive to override the weak symbols that are used to
51 // implement a fallback for toolchains that do not yet have a hostrpc library.
52 extern "C" {
53 unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
54                                     uint32_t device_id);
55 hsa_status_t hostrpc_init();
56 hsa_status_t hostrpc_terminate();
57 
58 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
59 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
60   return HSA_STATUS_SUCCESS;
61 }
62 __attribute__((weak)) unsigned long
63 hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
64   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
65      "missing\n",
66      device_id);
67   return 0;
68 }
69 }
70 
71 int print_kernel_trace;
72 
73 // Size of the target call stack structure
74 uint32_t TgtStackItemSize = 0;
75 
76 #undef check // Drop definition from internal.h
77 #ifdef OMPTARGET_DEBUG
78 #define check(msg, status)                                                     \
79   if (status != ATMI_STATUS_SUCCESS) {                                         \
80     /* fprintf(stderr, "[%s:%d] %s failed.\n", __FILE__, __LINE__, #msg);*/    \
81     DP(#msg " failed\n");                                                      \
82     /*assert(0);*/                                                             \
83   } else {                                                                     \
84     /* fprintf(stderr, "[%s:%d] %s succeeded.\n", __FILE__, __LINE__, #msg);   \
85      */                                                                        \
86     DP(#msg " succeeded\n");                                                   \
87   }
88 #else
89 #define check(msg, status)                                                     \
90   {}
91 #endif
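// Typical use, mirroring the calls later in this file: the ATMI status is
// passed straight through, e.g.
//   atmi_status_t err = atmi_init();
//   check("Initialising ATMI", err);
// In debug builds this prints a succeeded/failed line via DP; in release
// builds the macro compiles away.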
92 
93 #include "../../common/elf_common.c"
94 
95 static bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
96   const uint16_t amdgcnMachineID = 224;
97   int32_t r = elf_check_machine(image, amdgcnMachineID);
98   if (!r) {
99     DP("Supported machine ID not found\n");
100   }
101   return r;
102 }
103 
104 /// Keep entries table per device
105 struct FuncOrGblEntryTy {
106   __tgt_target_table Table;
107   std::vector<__tgt_offload_entry> Entries;
108 };
109 
110 enum ExecutionModeType {
111   SPMD,    // constructors, destructors,
112            // combined constructs (`teams distribute parallel for [simd]`)
113   GENERIC, // everything else
114   NONE
115 };
116 
117 struct KernelArgPool {
118 private:
119   static pthread_mutex_t mutex;
120 
121 public:
122   uint32_t kernarg_segment_size;
123   void *kernarg_region = nullptr;
124   std::queue<int> free_kernarg_segments;
125 
126   uint32_t kernarg_size_including_implicit() {
127     return kernarg_segment_size + sizeof(atmi_implicit_args_t);
128   }
129 
130   ~KernelArgPool() {
131     if (kernarg_region) {
132       auto r = hsa_amd_memory_pool_free(kernarg_region);
133       assert(r == HSA_STATUS_SUCCESS);
134       ErrorCheck(Memory pool free, r);
135     }
136   }
137 
138   // Can't really copy or move a mutex
139   KernelArgPool() = default;
140   KernelArgPool(const KernelArgPool &) = delete;
141   KernelArgPool(KernelArgPool &&) = delete;
142 
143   KernelArgPool(uint32_t kernarg_segment_size)
144       : kernarg_segment_size(kernarg_segment_size) {
145 
146     // atmi uses one pool per kernel for all gpus, with a fixed upper size.
147     // Preserving that exact scheme here, including the queue<int>.
148     {
149       hsa_status_t err = hsa_amd_memory_pool_allocate(
150           atl_gpu_kernarg_pools[0],
151           kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
152           &kernarg_region);
153       ErrorCheck(Allocating memory for the executable-kernel, err);
154       core::allow_access_to_all_gpu_agents(kernarg_region);
155 
156       for (int i = 0; i < MAX_NUM_KERNELS; i++) {
157         free_kernarg_segments.push(i);
158       }
159     }
160   }
161 
162   void *allocate(uint64_t arg_num) {
163     assert((arg_num * sizeof(void *)) == kernarg_segment_size);
164     lock l(&mutex);
165     void *res = nullptr;
166     if (!free_kernarg_segments.empty()) {
167 
168       int free_idx = free_kernarg_segments.front();
169       res = static_cast<void *>(static_cast<char *>(kernarg_region) +
170                                 (free_idx * kernarg_size_including_implicit()));
171       assert(free_idx == pointer_to_index(res));
172       free_kernarg_segments.pop();
173     }
174     return res;
175   }
176 
177   void deallocate(void *ptr) {
178     lock l(&mutex);
179     int idx = pointer_to_index(ptr);
180     free_kernarg_segments.push(idx);
181   }
182 
183 private:
184   int pointer_to_index(void *ptr) {
185     ptrdiff_t bytes =
186         static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
187     assert(bytes >= 0);
188     assert(bytes % kernarg_size_including_implicit() == 0);
189     return bytes / kernarg_size_including_implicit();
190   }
191   struct lock {
192     lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
193     ~lock() { pthread_mutex_unlock(m); }
194     pthread_mutex_t *m;
195   };
196 };
197 pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;
198 
199 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
200     KernelArgPoolMap;
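// Illustrative lifecycle of a pool entry (hypothetical launch-side sketch; the
// real dispatch code is not shown in this excerpt):
//   KernelArgPool *ArgPool = KernelArgPoolMap[kernel_name].get();
//   if (void *kernarg = ArgPool->allocate(arg_num)) {
//     // write arg_num void* arguments, then the atmi_implicit_args_t tail,
//     // dispatch the packet, and once the kernel has completed:
//     ArgPool->deallocate(kernarg);
//   } // allocate() returns nullptr when all MAX_NUM_KERNELS segments are busy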
201 
202 /// Use a single entity to encode a kernel and a set of flags
203 struct KernelTy {
204   // execution mode of kernel
205   // 0 - SPMD mode (without master warp)
206   // 1 - Generic mode (with master warp)
207   int8_t ExecutionMode;
208   int16_t ConstWGSize;
209   int32_t device_id;
210   void *CallStackAddr = nullptr;
211   const char *Name;
212 
213   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
214            void *_CallStackAddr, const char *_Name,
215            uint32_t _kernarg_segment_size)
216       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
217         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
218     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
219 
220     std::string N(_Name);
221     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
222       KernelArgPoolMap.insert(
223           std::make_pair(N, std::unique_ptr<KernelArgPool>(
224                                 new KernelArgPool(_kernarg_segment_size))));
225     }
226   }
227 };
228 
229 /// List that contains all the kernels.
230 /// FIXME: we may need this to be per device and per library.
231 std::list<KernelTy> KernelsList;
232 
233 // ATMI API to get gpu and gpu memory place
234 static atmi_place_t get_gpu_place(int device_id) {
235   return ATMI_PLACE_GPU(0, device_id);
236 }
237 static atmi_mem_place_t get_gpu_mem_place(int device_id) {
238   return ATMI_MEM_PLACE_GPU_MEM(0, device_id, 0);
239 }
240 
241 static std::vector<hsa_agent_t> find_gpu_agents() {
242   std::vector<hsa_agent_t> res;
243 
244   hsa_status_t err = hsa_iterate_agents(
245       [](hsa_agent_t agent, void *data) -> hsa_status_t {
246         std::vector<hsa_agent_t> *res =
247             static_cast<std::vector<hsa_agent_t> *>(data);
248 
249         hsa_device_type_t device_type;
250         // get_info fails iff HSA runtime not yet initialized
251         hsa_status_t err =
252             hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
253         if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
254           printf("rtl.cpp: err %d\n", err);
255         assert(err == HSA_STATUS_SUCCESS);
256 
257         if (device_type == HSA_DEVICE_TYPE_GPU) {
258           res->push_back(agent);
259         }
260         return HSA_STATUS_SUCCESS;
261       },
262       &res);
263 
264   // iterate_agents fails iff HSA runtime not yet initialized
265   if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
266     printf("rtl.cpp: err %d\n", err);
267   assert(err == HSA_STATUS_SUCCESS);
268   return res;
269 }
270 
271 static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
272                           void *data) {
273   if (status != HSA_STATUS_SUCCESS) {
274     const char *status_string;
275     if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
276       status_string = "unavailable";
277     }
278     fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__,
279             __LINE__, source, status, status_string);
280     abort();
281   }
282 }
283 
284 namespace core {
285 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
286   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
287 }
288 
289 uint16_t create_header(hsa_packet_type_t type, int barrier,
290                        atmi_task_fence_scope_t acq_fence,
291                        atmi_task_fence_scope_t rel_fence) {
292   uint16_t header = type << HSA_PACKET_HEADER_TYPE;
293   header |= barrier << HSA_PACKET_HEADER_BARRIER;
294   header |= (hsa_fence_scope_t) static_cast<int>(
295       acq_fence << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
296   header |= (hsa_fence_scope_t) static_cast<int>(
297       rel_fence << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
298   return header;
299 }
300 } // namespace core
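// Example of how the two helpers above are intended to compose when publishing
// an AQL packet (illustrative sketch only; `packet` is assumed to point at a
// hsa_kernel_dispatch_packet_t that is fully written except for its header):
//   uint16_t header = core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH,
//                                         /*barrier=*/1, acq_fence, rel_fence);
//   core::packet_store_release(reinterpret_cast<uint32_t *>(packet), header,
//                              packet->setup);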
301 
302 /// Class containing all the device information
303 class RTLDeviceInfoTy {
304   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
305 
306 public:
307   // load binary populates symbol tables and mutates various global state
308   // run uses those symbol tables
309   std::shared_timed_mutex load_run_lock;
310 
311   int NumberOfDevices;
312 
313   // GPU devices
314   std::vector<hsa_agent_t> HSAAgents;
315   std::vector<hsa_queue_t *> HSAQueues; // one per gpu
316 
317   // Device properties
318   std::vector<int> ComputeUnits;
319   std::vector<int> GroupsPerDevice;
320   std::vector<int> ThreadsPerGroup;
321   std::vector<int> WarpSize;
322 
323   // OpenMP properties
324   std::vector<int> NumTeams;
325   std::vector<int> NumThreads;
326 
327   // OpenMP Environment properties
328   int EnvNumTeams;
329   int EnvTeamLimit;
330   int EnvMaxTeamsDefault;
331 
332   // OpenMP Requires Flags
333   int64_t RequiresFlags;
334 
335   // Resource pools
336   SignalPoolT FreeSignalPool;
337 
338   struct atmiFreePtrDeletor {
339     void operator()(void *p) {
340       atmi_free(p); // ignore failure to free
341     }
342   };
343 
344   // device_State shared across loaded binaries, error if inconsistent size
345   std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>>
346       deviceStateStore;
347 
348   static const unsigned HardTeamLimit =
349       (1 << 16) - 1; // 64K needed to fit in uint16
350   static const int DefaultNumTeams = 128;
351   static const int Max_Teams =
352       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
353   static const int Warp_Size =
354       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
355   static const int Max_WG_Size =
356       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
357   static const int Default_WG_Size =
358       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
359 
360   using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *,
361                                        size_t size, hsa_agent_t);
362   atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
363                                       MemcpyFunc Func, int32_t deviceId) {
364     hsa_agent_t agent = HSAAgents[deviceId];
365     hsa_signal_t s = FreeSignalPool.pop();
366     if (s.handle == 0) {
367       return ATMI_STATUS_ERROR;
368     }
369     atmi_status_t r = Func(s, dest, src, size, agent);
370     FreeSignalPool.push(s);
371     return r;
372   }
373 
374   atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
375                                           size_t size, int32_t deviceId) {
376     return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
377   }
378 
379   atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
380                                           size_t size, int32_t deviceId) {
381     return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
382   }
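  // The two wrappers above back dataSubmit/dataRetrieve below. Each transfer
  // borrows a completion signal from FreeSignalPool and returns it afterwards,
  // so copies issued from different threads do not serialize on one signal.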
383 
384   // Record entry point associated with device
385   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
386     assert(device_id < (int32_t)FuncGblEntries.size() &&
387            "Unexpected device id!");
388     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
389 
390     E.Entries.push_back(entry);
391   }
392 
393   // Return true if the entry is associated with device
394   bool findOffloadEntry(int32_t device_id, void *addr) {
395     assert(device_id < (int32_t)FuncGblEntries.size() &&
396            "Unexpected device id!");
397     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
398 
399     for (auto &it : E.Entries) {
400       if (it.addr == addr)
401         return true;
402     }
403 
404     return false;
405   }
406 
407   // Return the pointer to the target entries table
408   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
409     assert(device_id < (int32_t)FuncGblEntries.size() &&
410            "Unexpected device id!");
411     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
412 
413     int32_t size = E.Entries.size();
414 
415     // Table is empty
416     if (!size)
417       return 0;
418 
419     __tgt_offload_entry *begin = &E.Entries[0];
420     __tgt_offload_entry *end = &E.Entries[size - 1];
421 
422     // Update table info according to the entries and return the pointer
423     E.Table.EntriesBegin = begin;
424     E.Table.EntriesEnd = ++end;
425 
426     return &E.Table;
427   }
428 
429   // Clear entries table for a device
430   void clearOffloadEntriesTable(int device_id) {
431     assert(device_id < (int32_t)FuncGblEntries.size() &&
432            "Unexpected device id!");
433     FuncGblEntries[device_id].emplace_back();
434     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
435     // KernelArgPoolMap.clear();
436     E.Entries.clear();
437     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
438   }
439 
440   RTLDeviceInfoTy() {
441     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
442     // anytime. You do not need a debug library build.
443     //  0 => no tracing
444     //  1 => tracing dispatch only
445     // >1 => verbosity increase
446     if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
447       print_kernel_trace = atoi(envStr);
448     else
449       print_kernel_trace = 0;
450 
451     DP("Start initializing HSA-ATMI\n");
452     atmi_status_t err = atmi_init();
453     if (err != ATMI_STATUS_SUCCESS) {
454       DP("Error when initializing HSA-ATMI\n");
455       return;
456     }
457     // Init hostcall soon after initializing ATMI
458     hostrpc_init();
459 
460     HSAAgents = find_gpu_agents();
461     NumberOfDevices = (int)HSAAgents.size();
462 
463     if (NumberOfDevices == 0) {
464       DP("There are no devices supporting HSA.\n");
465       return;
466     } else {
467       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
468     }
469 
470     // Init the device info
471     HSAQueues.resize(NumberOfDevices);
472     FuncGblEntries.resize(NumberOfDevices);
473     ThreadsPerGroup.resize(NumberOfDevices);
474     ComputeUnits.resize(NumberOfDevices);
475     GroupsPerDevice.resize(NumberOfDevices);
476     WarpSize.resize(NumberOfDevices);
477     NumTeams.resize(NumberOfDevices);
478     NumThreads.resize(NumberOfDevices);
479     deviceStateStore.resize(NumberOfDevices);
480 
481     for (int i = 0; i < NumberOfDevices; i++) {
482       uint32_t queue_size = 0;
483       {
484         hsa_status_t err;
485         err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
486                                  &queue_size);
487         ErrorCheck(Querying the agent maximum queue size, err);
488         if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
489           queue_size = core::Runtime::getInstance().getMaxQueueSize();
490         }
491       }
492 
493       hsa_status_t rc = hsa_queue_create(
494           HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,
495           UINT32_MAX, UINT32_MAX, &HSAQueues[i]);
496       if (rc != HSA_STATUS_SUCCESS) {
497         DP("Failed to create HSA queues\n");
498         return;
499       }
500 
501       deviceStateStore[i] = {nullptr, 0};
502     }
503 
504     for (int i = 0; i < NumberOfDevices; i++) {
505       ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
506       GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
507       ComputeUnits[i] = 1;
508       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
509          GroupsPerDevice[i], ThreadsPerGroup[i]);
510     }
511 
512     // Get environment variables regarding teams
513     char *envStr = getenv("OMP_TEAM_LIMIT");
514     if (envStr) {
515       // OMP_TEAM_LIMIT has been set
516       EnvTeamLimit = std::stoi(envStr);
517       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
518     } else {
519       EnvTeamLimit = -1;
520     }
521     envStr = getenv("OMP_NUM_TEAMS");
522     if (envStr) {
523       // OMP_NUM_TEAMS has been set
524       EnvNumTeams = std::stoi(envStr);
525       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
526     } else {
527       EnvNumTeams = -1;
528     }
529     // Get environment variables regarding expMaxTeams
530     envStr = getenv("OMP_MAX_TEAMS_DEFAULT");
531     if (envStr) {
532       EnvMaxTeamsDefault = std::stoi(envStr);
533       DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault);
534     } else {
535       EnvMaxTeamsDefault = -1;
536     }
537 
538     // Default state.
539     RequiresFlags = OMP_REQ_UNDEFINED;
540   }
541 
542   ~RTLDeviceInfoTy() {
543     DP("Finalizing the HSA-ATMI DeviceInfo.\n");
544     // Run destructors on types that use HSA before
545     // atmi_finalize removes access to it
546     deviceStateStore.clear();
547     KernelArgPoolMap.clear();
548     // Terminate hostrpc before finalizing ATMI
549     hostrpc_terminate();
550     atmi_finalize();
551   }
552 };
553 
554 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
555 
556 // TODO: May need to drop the trailing two fields until deviceRTL is updated
557 struct omptarget_device_environmentTy {
558   int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
559                        // only useful for Debug build of deviceRTLs
560   int32_t num_devices; // gets number of active offload devices
561   int32_t device_num;  // gets a value 0 to num_devices-1
562 };
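// A host-side instance of this struct is filled in during image load and
// copied over the device image's "omptarget_device_environment" symbol before
// the module is registered with ATMI (see on_deserialized_data in
// __tgt_rtl_load_binary_locked below).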
563 
564 static RTLDeviceInfoTy DeviceInfo;
565 
566 namespace {
567 
568 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
569                      __tgt_async_info *AsyncInfoPtr) {
570   assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
571   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
572   // Return success if we are not copying back to host from target.
573   if (!HstPtr)
574     return OFFLOAD_SUCCESS;
575   atmi_status_t err;
576   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
577      (long long unsigned)(Elf64_Addr)TgtPtr,
578      (long long unsigned)(Elf64_Addr)HstPtr);
579 
580   err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
581                                              DeviceId);
582 
583   if (err != ATMI_STATUS_SUCCESS) {
584     DP("Error when copying data from device to host. Pointers: "
585        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
586        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
587     return OFFLOAD_FAIL;
588   }
589   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
590      (long long unsigned)(Elf64_Addr)TgtPtr,
591      (long long unsigned)(Elf64_Addr)HstPtr);
592   return OFFLOAD_SUCCESS;
593 }
594 
595 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
596                    __tgt_async_info *AsyncInfoPtr) {
597   assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
598   atmi_status_t err;
599   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
600   // Return success if we are not doing host to target.
601   if (!HstPtr)
602     return OFFLOAD_SUCCESS;
603 
604   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
605      (long long unsigned)(Elf64_Addr)HstPtr,
606      (long long unsigned)(Elf64_Addr)TgtPtr);
607   err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
608                                              DeviceId);
609   if (err != ATMI_STATUS_SUCCESS) {
610     DP("Error when copying data from host to device. Pointers: "
611        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
612        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
613     return OFFLOAD_FAIL;
614   }
615   return OFFLOAD_SUCCESS;
616 }
617 
618 // Async.
619 // The implementation was written with cuda streams in mind. The semantics of
620 // that are to execute kernels on a queue in order of insertion. A synchronise
621 // call then makes writes visible between host and device. This means a series
622 // of N data_submit_async calls are expected to execute serially. HSA offers
623 // various options to run the data copies concurrently. This may require changes
624 // to libomptarget.
625 
626 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
627 // there are no outstanding kernels that need to be synchronized. Any async call
628 // may be passed a Queue==0, at which point the cuda implementation will set it
629 // to non-null (see getStream). The cuda streams are per-device. Upstream may
630 // change this interface to explicitly initialize the async_info_pointer, but
631 // until then hsa lazily initializes it as well.
632 
633 void initAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
634   // set non-null while using async calls, return to null to indicate completion
635   assert(async_info_ptr);
636   if (!async_info_ptr->Queue) {
637     async_info_ptr->Queue = reinterpret_cast<void *>(UINT64_MAX);
638   }
639 }
640 void finiAsyncInfoPtr(__tgt_async_info *async_info_ptr) {
641   assert(async_info_ptr);
642   assert(async_info_ptr->Queue);
643   async_info_ptr->Queue = 0;
644 }
645 } // namespace
646 
647 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
648   return elf_machine_id_is_amdgcn(image);
649 }
650 
651 int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
652 
653 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
654   DP("Init requires flags to %ld\n", RequiresFlags);
655   DeviceInfo.RequiresFlags = RequiresFlags;
656   return RequiresFlags;
657 }
658 
659 int32_t __tgt_rtl_init_device(int device_id) {
660   hsa_status_t err;
661 
662   // this is per device id init
663   DP("Initialize the device id: %d\n", device_id);
664 
665   hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
666 
667   // Get number of Compute Units
668   uint32_t compute_units = 0;
669   err = hsa_agent_get_info(
670       agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
671       &compute_units);
672   if (err != HSA_STATUS_SUCCESS) {
673     DeviceInfo.ComputeUnits[device_id] = 1;
674     DP("Error getting compute units : settiing to 1\n");
675   } else {
676     DeviceInfo.ComputeUnits[device_id] = compute_units;
677     DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]);
678   }
679   if (print_kernel_trace == 4)
680     fprintf(stderr, "Device#%-2d CU's: %2d\n", device_id,
681             DeviceInfo.ComputeUnits[device_id]);
682 
683   // Query attributes to determine number of threads/block and blocks/grid.
684   uint16_t workgroup_max_dim[3];
685   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
686                            &workgroup_max_dim);
687   if (err != HSA_STATUS_SUCCESS) {
688     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
689     DP("Error getting grid dims: num groups : %d\n",
690        RTLDeviceInfoTy::DefaultNumTeams);
691   } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
692     DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
693     DP("Using %d ROCm blocks per grid\n",
694        DeviceInfo.GroupsPerDevice[device_id]);
695   } else {
696     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
697     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
698        "at the hard limit\n",
699        workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
700   }
701 
702   // Get thread limit
703   hsa_dim3_t grid_max_dim;
704   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
705   if (err == HSA_STATUS_SUCCESS) {
706     DeviceInfo.ThreadsPerGroup[device_id] =
707         reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
708         DeviceInfo.GroupsPerDevice[device_id];
709     if ((DeviceInfo.ThreadsPerGroup[device_id] >
710          RTLDeviceInfoTy::Max_WG_Size) ||
711         DeviceInfo.ThreadsPerGroup[device_id] == 0) {
712       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
713       DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
714     } else {
715       DP("Using ROCm Queried thread limit: %d\n",
716          DeviceInfo.ThreadsPerGroup[device_id]);
717     }
718   } else {
719     DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
720     DP("Error getting max block dimension, use default:%d \n",
721        RTLDeviceInfoTy::Max_WG_Size);
722   }
723 
724   // Get wavefront size
725   uint32_t wavefront_size = 0;
726   err =
727       hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
728   if (err == HSA_STATUS_SUCCESS) {
729     DP("Queried wavefront size: %d\n", wavefront_size);
730     DeviceInfo.WarpSize[device_id] = wavefront_size;
731   } else {
732     DP("Default wavefront size: %d\n",
733        llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]);
734     DeviceInfo.WarpSize[device_id] =
735         llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
736   }
737 
738   // Adjust teams to the env variables
739   if (DeviceInfo.EnvTeamLimit > 0 &&
740       DeviceInfo.GroupsPerDevice[device_id] > DeviceInfo.EnvTeamLimit) {
741     DeviceInfo.GroupsPerDevice[device_id] = DeviceInfo.EnvTeamLimit;
742     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
743        DeviceInfo.EnvTeamLimit);
744   }
745 
746   // Set default number of teams
747   if (DeviceInfo.EnvNumTeams > 0) {
748     DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
749     DP("Default number of teams set according to environment %d\n",
750        DeviceInfo.EnvNumTeams);
751   } else {
752     DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
753     DP("Default number of teams set according to library's default %d\n",
754        RTLDeviceInfoTy::DefaultNumTeams);
755   }
756 
757   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.GroupsPerDevice[device_id]) {
758     DeviceInfo.NumTeams[device_id] = DeviceInfo.GroupsPerDevice[device_id];
759     DP("Default number of teams exceeds device limit, capping at %d\n",
760        DeviceInfo.GroupsPerDevice[device_id]);
761   }
762 
763   // Set default number of threads
764   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
765   DP("Default number of threads set according to library's default %d\n",
766      RTLDeviceInfoTy::Default_WG_Size);
767   if (DeviceInfo.NumThreads[device_id] >
768       DeviceInfo.ThreadsPerGroup[device_id]) {
769     DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerGroup[device_id];
770     DP("Default number of threads exceeds device limit, capping at %d\n",
771        DeviceInfo.ThreadsPerGroup[device_id]);
772   }
773 
774   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
775      device_id, DeviceInfo.GroupsPerDevice[device_id],
776      DeviceInfo.ThreadsPerGroup[device_id]);
777 
778   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
779      DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
780      DeviceInfo.GroupsPerDevice[device_id],
781      DeviceInfo.GroupsPerDevice[device_id] *
782          DeviceInfo.ThreadsPerGroup[device_id]);
783 
784   return OFFLOAD_SUCCESS;
785 }
786 
787 namespace {
788 Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
789   size_t N;
790   int rc = elf_getshdrnum(elf, &N);
791   if (rc != 0) {
792     return nullptr;
793   }
794 
795   Elf64_Shdr *result = nullptr;
796   for (size_t i = 0; i < N; i++) {
797     Elf_Scn *scn = elf_getscn(elf, i);
798     if (scn) {
799       Elf64_Shdr *shdr = elf64_getshdr(scn);
800       if (shdr) {
801         if (shdr->sh_type == SHT_HASH) {
802           if (result == nullptr) {
803             result = shdr;
804           } else {
805             // multiple SHT_HASH sections not handled
806             return nullptr;
807           }
808         }
809       }
810     }
811   }
812   return result;
813 }
814 
815 const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
816                             const char *symname) {
817 
818   assert(section_hash);
819   size_t section_symtab_index = section_hash->sh_link;
820   Elf64_Shdr *section_symtab =
821       elf64_getshdr(elf_getscn(elf, section_symtab_index));
822   size_t section_strtab_index = section_symtab->sh_link;
823 
824   const Elf64_Sym *symtab =
825       reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);
826 
827   const uint32_t *hashtab =
828       reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);
829 
830   // Layout:
831   // nbucket
832   // nchain
833   // bucket[nbucket]
834   // chain[nchain]
835   uint32_t nbucket = hashtab[0];
836   const uint32_t *bucket = &hashtab[2];
837   const uint32_t *chain = &hashtab[nbucket + 2];
838 
839   const size_t max = strlen(symname) + 1;
840   const uint32_t hash = elf_hash(symname);
841   for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
842     char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
843     if (strncmp(symname, n, max) == 0) {
844       return &symtab[i];
845     }
846   }
847 
848   return nullptr;
849 }
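// Worked example of the layout above: with nbucket == 4 the hash section reads
// {4, nchain, bucket[0..3], chain[0..nchain-1]}. A lookup starts at
// i = bucket[elf_hash(symname) % 4] and follows chain[i] until a symtab entry
// with a matching name is found or the chain terminates at index 0.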
850 
851 typedef struct {
852   void *addr = nullptr;
853   uint32_t size = UINT32_MAX;
854 } symbol_info;
855 
856 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
857                                     symbol_info *res) {
858   if (elf_kind(elf) != ELF_K_ELF) {
859     return 1;
860   }
861 
862   Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
863   if (!section_hash) {
864     return 1;
865   }
866 
867   const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
868   if (!sym) {
869     return 1;
870   }
871 
872   if (sym->st_size > UINT32_MAX) {
873     return 1;
874   }
875 
876   res->size = static_cast<uint32_t>(sym->st_size);
877   res->addr = sym->st_value + base;
878   return 0;
879 }
880 
881 int get_symbol_info_without_loading(char *base, size_t img_size,
882                                     const char *symname, symbol_info *res) {
883   Elf *elf = elf_memory(base, img_size);
884   if (elf) {
885     int rc = get_symbol_info_without_loading(elf, base, symname, res);
886     elf_end(elf);
887     return rc;
888   }
889   return 1;
890 }
891 
892 atmi_status_t interop_get_symbol_info(char *base, size_t img_size,
893                                       const char *symname, void **var_addr,
894                                       uint32_t *var_size) {
895   symbol_info si;
896   int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
897   if (rc == 0) {
898     *var_addr = si.addr;
899     *var_size = si.size;
900     return ATMI_STATUS_SUCCESS;
901   } else {
902     return ATMI_STATUS_ERROR;
903   }
904 }
905 
906 template <typename C>
907 atmi_status_t module_register_from_memory_to_place(void *module_bytes,
908                                                    size_t module_size,
909                                                    atmi_place_t place, C cb) {
910   auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t {
911     C *unwrapped = static_cast<C *>(cb_state);
912     return (*unwrapped)(data, size);
913   };
914   return atmi_module_register_from_memory_to_place(
915       module_bytes, module_size, place, L, static_cast<void *>(&cb));
916 }
917 } // namespace
918 
919 static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
920   uint64_t device_State_bytes = 0;
921   {
922     // If this is the deviceRTL, get the state variable size
923     symbol_info size_si;
924     int rc = get_symbol_info_without_loading(
925         ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);
926 
927     if (rc == 0) {
928       if (size_si.size != sizeof(uint64_t)) {
929         fprintf(stderr,
930                 "Found device_State_size variable with wrong size, aborting\n");
931         exit(1);
932       }
933 
934       // Read number of bytes directly from the elf
935       memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
936     }
937   }
938   return device_State_bytes;
939 }
940 
941 static __tgt_target_table *
942 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
946 
947 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
948                                           __tgt_device_image *image) {
949   DeviceInfo.load_run_lock.lock();
950   __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
951   DeviceInfo.load_run_lock.unlock();
952   return res;
953 }
954 
955 static atmi_status_t atmi_calloc(void **ret_ptr, size_t size,
956                                  atmi_mem_place_t place) {
957   uint64_t rounded = 4 * ((size + 3) / 4);
958   void *ptr;
959   atmi_status_t err = atmi_malloc(&ptr, rounded, place);
960   if (err != ATMI_STATUS_SUCCESS) {
961     return err;
962   }
963 
964   hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
965   if (rc != HSA_STATUS_SUCCESS) {
966     fprintf(stderr, "zero fill device_state failed with %u\n", rc);
967     atmi_free(ptr);
968     return ATMI_STATUS_ERROR;
969   }
970 
971   *ret_ptr = ptr;
972   return ATMI_STATUS_SUCCESS;
973 }
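// Worked example of the rounding above: a 10 byte request allocates
// rounded = 4 * ((10 + 3) / 4) = 12 bytes and zero-fills rounded / 4 = 3
// uint32_t words, so the fill always covers the whole (padded) allocation.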
974 
975 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
976                                                  __tgt_device_image *image) {
977   // This function loads the device image onto gpu[device_id] and does other
978   // per-image initialization work. Specifically:
979   //
980   // - Initialize an omptarget_device_environmentTy instance embedded in the
981   //   image at the symbol "omptarget_device_environment"
982   //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
983   //
984   // - Allocate a large array per-gpu (could be moved to init_device)
985   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
986   //   - Allocate at least that many bytes of gpu memory
987   //   - Zero initialize it
988   //   - Write the pointer to the symbol omptarget_nvptx_device_State
989   //
990   // - Pulls some per-kernel information together from various sources and
991   //   records it in the KernelsList for quicker access later
992   //
993   // The initialization can be done before or after loading the image onto the
994   // gpu. This function presently does a mixture. Using the hsa api to get/set
995   // the information is simpler to implement, in exchange for more complicated
996   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
997   // back from the gpu vs a hashtable lookup on the host.
998 
999   const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
1000 
1001   DeviceInfo.clearOffloadEntriesTable(device_id);
1002 
1003   // We do not need to set the ELF version because the caller of this function
1004   // had to do that to decide the right runtime to use
1005 
1006   if (!elf_machine_id_is_amdgcn(image)) {
1007     return NULL;
1008   }
1009 
1010   omptarget_device_environmentTy host_device_env;
1011   host_device_env.num_devices = DeviceInfo.NumberOfDevices;
1012   host_device_env.device_num = device_id;
1013   host_device_env.debug_level = 0;
1014 #ifdef OMPTARGET_DEBUG
1015   if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
1016     host_device_env.debug_level = std::stoi(envStr);
1017   }
1018 #endif
1019 
1020   auto on_deserialized_data = [&](void *data, size_t size) -> atmi_status_t {
1021     const char *device_env_Name = "omptarget_device_environment";
1022     symbol_info si;
1023     int rc = get_symbol_info_without_loading((char *)image->ImageStart,
1024                                              img_size, device_env_Name, &si);
1025     if (rc != 0) {
1026       DP("Finding global device environment '%s' - symbol missing.\n",
1027          device_env_Name);
1028       // no need to return FAIL, consider this is not a device debug build.
1029       return ATMI_STATUS_SUCCESS;
1030     }
1031     if (si.size != sizeof(host_device_env)) {
1032       return ATMI_STATUS_ERROR;
1033     }
1034     DP("Setting global device environment %u bytes\n", si.size);
1035     uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
1036     void *pos = (char *)data + offset;
1037     memcpy(pos, &host_device_env, sizeof(host_device_env));
1038     return ATMI_STATUS_SUCCESS;
1039   };
1040 
1041   atmi_status_t err;
1042   {
1043     err = module_register_from_memory_to_place(
1044         (void *)image->ImageStart, img_size, get_gpu_place(device_id),
1045         on_deserialized_data);
1046 
1047     check("Module registering", err);
1048     if (err != ATMI_STATUS_SUCCESS) {
1049       char GPUName[64] = "--unknown gpu--";
1050       hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
1051       (void)hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
1052                                (void *)GPUName);
1053       fprintf(stderr,
1054               "Possible gpu arch mismatch: %s, please check"
1055               " compiler: -march=<gpu> flag\n",
1056               GPUName);
1057       return NULL;
1058     }
1059   }
1060 
1061   DP("ATMI module successfully loaded!\n");
1062 
1063   {
1064     // the device_State array is either a large value in bss or a void* that
1065     // needs to be assigned to a pointer to an array of size device_state_bytes
1066 
1067     void *state_ptr;
1068     uint32_t state_ptr_size;
1069     atmi_status_t err = atmi_interop_hsa_get_symbol_info(
1070         get_gpu_mem_place(device_id), "omptarget_nvptx_device_State",
1071         &state_ptr, &state_ptr_size);
1072 
1073     if (err != ATMI_STATUS_SUCCESS) {
1074       fprintf(stderr, "failed to find device_state symbol\n");
1075       return NULL;
1076     }
1077 
1078     if (state_ptr_size < sizeof(void *)) {
1079       fprintf(stderr, "unexpected size of state_ptr %u != %zu\n",
1080               state_ptr_size, sizeof(void *));
1081       return NULL;
1082     }
1083 
1084     // if it's larger than a void*, assume it's a bss array and no further
1085     // initialization is required. Only try to set up a pointer for
1086     // sizeof(void*)
1087     if (state_ptr_size == sizeof(void *)) {
1088       uint64_t device_State_bytes =
1089           get_device_State_bytes((char *)image->ImageStart, img_size);
1090       if (device_State_bytes == 0) {
1091         return NULL;
1092       }
1093 
1094       auto &dss = DeviceInfo.deviceStateStore[device_id];
1095       if (dss.first.get() == nullptr) {
1096         assert(dss.second == 0);
1097         void *ptr = NULL;
1098         atmi_status_t err =
1099             atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
1100         if (err != ATMI_STATUS_SUCCESS) {
1101           fprintf(stderr, "Failed to allocate device_state array\n");
1102           return NULL;
1103         }
1104         dss = {std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
1105                device_State_bytes};
1106       }
1107 
1108       void *ptr = dss.first.get();
1109       if (device_State_bytes != dss.second) {
1110         fprintf(stderr, "Inconsistent sizes of device_State unsupported\n");
1111         exit(1);
1112       }
1113 
1114       // write ptr to device memory so it can be used by later kernels
1115       err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
1116                                                  sizeof(void *), device_id);
1117       if (err != ATMI_STATUS_SUCCESS) {
1118         fprintf(stderr, "memcpy install of state_ptr failed\n");
1119         return NULL;
1120       }
1121     }
1122   }
1123 
1124   // TODO: Check with Guansong to understand the below comment more thoroughly.
1125   // Here, we take advantage of the data that is appended after img_end to get
1126   // the symbols' names we need to load. This data consists of the host entries
1127   // begin and end as well as the target name (see the offloading linker script
1128   // creation in clang compiler).
1129 
1130   // Find the symbols in the module by name. The name can be obtained by
1131   // concatenating the host entry name with the target name
1132 
1133   __tgt_offload_entry *HostBegin = image->EntriesBegin;
1134   __tgt_offload_entry *HostEnd = image->EntriesEnd;
1135 
1136   for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
1137 
1138     if (!e->addr) {
1139       // The host should always have something in the address to
1140       // uniquely identify the target region.
1141       fprintf(stderr, "Analyzing host entry '<null>' (size = %lld)...\n",
1142               (unsigned long long)e->size);
1143       return NULL;
1144     }
1145 
1146     if (e->size) {
1147       __tgt_offload_entry entry = *e;
1148 
1149       void *varptr;
1150       uint32_t varsize;
1151 
1152       err = atmi_interop_hsa_get_symbol_info(get_gpu_mem_place(device_id),
1153                                              e->name, &varptr, &varsize);
1154 
1155       if (err != ATMI_STATUS_SUCCESS) {
1156         DP("Loading global '%s' (Failed)\n", e->name);
1157         // Inform the user what symbol prevented offloading
1158         fprintf(stderr, "Loading global '%s' (Failed)\n", e->name);
1159         return NULL;
1160       }
1161 
1162       if (varsize != e->size) {
1163         DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
1164            varsize, e->size);
1165         return NULL;
1166       }
1167 
1168       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
1169          DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
1170       entry.addr = (void *)varptr;
1171 
1172       DeviceInfo.addOffloadEntry(device_id, entry);
1173 
1174       if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1175           e->flags & OMP_DECLARE_TARGET_LINK) {
1176         // If unified memory is present any target link variables
1177         // can access host addresses directly. There is no longer a
1178         // need for device copies.
1179         err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
1180                                                    sizeof(void *), device_id);
1181         if (err != ATMI_STATUS_SUCCESS)
1182           DP("Error when copying USM\n");
1183         DP("Copy linked variable host address (" DPxMOD ")"
1184            "to device address (" DPxMOD ")\n",
1185            DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
1186       }
1187 
1188       continue;
1189     }
1190 
1191     DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name));
1192 
1193     atmi_mem_place_t place = get_gpu_mem_place(device_id);
1194     uint32_t kernarg_segment_size;
1195     err = atmi_interop_hsa_get_kernel_info(
1196         place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1197         &kernarg_segment_size);
1198 
1199     // each arg is a void * in this openmp implementation
1200     uint32_t arg_num = kernarg_segment_size / sizeof(void *);
1201     std::vector<size_t> arg_sizes(arg_num);
1202     for (std::vector<size_t>::iterator it = arg_sizes.begin();
1203          it != arg_sizes.end(); it++) {
1204       *it = sizeof(void *);
1205     }
1206 
1207     // default value GENERIC (in case the symbol is missing from the device image)
1208     int8_t ExecModeVal = ExecutionModeType::GENERIC;
1209 
1210     // get flat group size if present, else Default_WG_Size
1211     int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1212 
1213     // get Kernel Descriptor if present.
1214     // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
1215     struct KernDescValType {
1216       uint16_t Version;
1217       uint16_t TSize;
1218       uint16_t WG_Size;
1219       uint8_t Mode;
1220     };
1221     struct KernDescValType KernDescVal;
1222     std::string KernDescNameStr(e->name);
1223     KernDescNameStr += "_kern_desc";
1224     const char *KernDescName = KernDescNameStr.c_str();
1225 
1226     void *KernDescPtr;
1227     uint32_t KernDescSize;
1228     void *CallStackAddr = nullptr;
1229     err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1230                                   KernDescName, &KernDescPtr, &KernDescSize);
1231 
1232     if (err == ATMI_STATUS_SUCCESS) {
1233       if ((size_t)KernDescSize != sizeof(KernDescVal))
1234         DP("Loading global computation properties '%s' - size mismatch (%u != "
1235            "%lu)\n",
1236            KernDescName, KernDescSize, sizeof(KernDescVal));
1237 
1238       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
1239 
1240       // Check structure size against recorded size.
1241       if ((size_t)KernDescSize != KernDescVal.TSize)
1242         DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
1243            sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
1244 
1245       DP("After loading global for %s KernDesc \n", KernDescName);
1246       DP("KernDesc: Version: %d\n", KernDescVal.Version);
1247       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
1248       DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
1249       DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
1250 
1251       // Get ExecMode
1252       ExecModeVal = KernDescVal.Mode;
1253       DP("ExecModeVal %d\n", ExecModeVal);
1254       if (KernDescVal.WG_Size == 0) {
1255         KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
1256         DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
1257       }
1258       WGSizeVal = KernDescVal.WG_Size;
1259       DP("WGSizeVal %d\n", WGSizeVal);
1260       check("Loading KernDesc computation property", err);
1261     } else {
1262       DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
1263 
1264       // Generic
1265       std::string ExecModeNameStr(e->name);
1266       ExecModeNameStr += "_exec_mode";
1267       const char *ExecModeName = ExecModeNameStr.c_str();
1268 
1269       void *ExecModePtr;
1270       uint32_t varsize;
1271       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1272                                     ExecModeName, &ExecModePtr, &varsize);
1273 
1274       if (err == ATMI_STATUS_SUCCESS) {
1275         if ((size_t)varsize != sizeof(int8_t)) {
1276           DP("Loading global computation properties '%s' - size mismatch(%u != "
1277              "%lu)\n",
1278              ExecModeName, varsize, sizeof(int8_t));
1279           return NULL;
1280         }
1281 
1282         memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
1283 
1284         DP("After loading global for %s ExecMode = %d\n", ExecModeName,
1285            ExecModeVal);
1286 
1287         if (ExecModeVal < 0 || ExecModeVal > 1) {
1288           DP("Error wrong exec_mode value specified in HSA code object file: "
1289              "%d\n",
1290              ExecModeVal);
1291           return NULL;
1292         }
1293       } else {
1294         DP("Loading global exec_mode '%s' - symbol missing, using default "
1295            "value "
1296            "GENERIC (1)\n",
1297            ExecModeName);
1298       }
1299       check("Loading computation property", err);
1300 
1301       // Flat group size
1302       std::string WGSizeNameStr(e->name);
1303       WGSizeNameStr += "_wg_size";
1304       const char *WGSizeName = WGSizeNameStr.c_str();
1305 
1306       void *WGSizePtr;
1307       uint32_t WGSize;
1308       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1309                                     WGSizeName, &WGSizePtr, &WGSize);
1310 
1311       if (err == ATMI_STATUS_SUCCESS) {
1312         if ((size_t)WGSize != sizeof(int16_t)) {
1313           DP("Loading global computation properties '%s' - size mismatch (%u "
1314              "!= "
1315              "%lu)\n",
1316              WGSizeName, WGSize, sizeof(int16_t));
1317           return NULL;
1318         }
1319 
1320         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
1321 
1322         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
1323 
1324         if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
1325             WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
1326           DP("Error wrong WGSize value specified in HSA code object file: "
1327              "%d\n",
1328              WGSizeVal);
1329           WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1330         }
1331       } else {
1332         DP("Warning: Loading WGSize '%s' - symbol not found, "
1333            "using default value %d\n",
1334            WGSizeName, WGSizeVal);
1335       }
1336 
1337       check("Loading WGSize computation property", err);
1338     }
1339 
1340     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
1341                                    CallStackAddr, e->name,
1342                                    kernarg_segment_size));
1343     __tgt_offload_entry entry = *e;
1344     entry.addr = (void *)&KernelsList.back();
1345     DeviceInfo.addOffloadEntry(device_id, entry);
1346     DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
1347   }
1348 
1349   return DeviceInfo.getOffloadEntriesTable(device_id);
1350 }
1351 
1352 void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) {
1353   void *ptr = NULL;
1354   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1355   atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id));
1356   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
1357      (long long unsigned)(Elf64_Addr)ptr);
1358   ptr = (err == ATMI_STATUS_SUCCESS) ? ptr : NULL;
1359   return ptr;
1360 }
1361 
1362 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
1363                               int64_t size) {
1364   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1365   __tgt_async_info async_info;
1366   int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info);
1367   if (rc != OFFLOAD_SUCCESS)
1368     return OFFLOAD_FAIL;
1369 
1370   return __tgt_rtl_synchronize(device_id, &async_info);
1371 }
1372 
1373 int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
1374                                     int64_t size,
1375                                     __tgt_async_info *async_info_ptr) {
1376   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1377   if (async_info_ptr) {
1378     initAsyncInfoPtr(async_info_ptr);
1379     return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
1380   } else {
1381     return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
1382   }
1383 }
1384 
1385 int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
1386                                 int64_t size) {
1387   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1388   __tgt_async_info async_info;
1389   int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info);
1390   if (rc != OFFLOAD_SUCCESS)
1391     return OFFLOAD_FAIL;
1392 
1393   return __tgt_rtl_synchronize(device_id, &async_info);
1394 }
1395 
1396 int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
1397                                       void *tgt_ptr, int64_t size,
1398                                       __tgt_async_info *async_info_ptr) {
1399   assert(async_info_ptr && "async_info is nullptr");
1400   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1401   initAsyncInfoPtr(async_info_ptr);
1402   return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
1403 }
1404 
1405 int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
1406   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1407   atmi_status_t err;
1408   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
1409   err = atmi_free(tgt_ptr);
1410   if (err != ATMI_STATUS_SUCCESS) {
1411     DP("Error when freeing HSA memory\n");
1412     return OFFLOAD_FAIL;
1413   }
1414   return OFFLOAD_SUCCESS;
1415 }
1416 
1417 // Determine launch values for threadsPerGroup and num_groups.
1418 // Outputs: threadsPerGroup, num_groups
1419 // Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
1420 //         EnvTeamLimit, EnvNumTeams, num_teams, thread_limit,
1421 //         loop_tripcount.
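// Illustrative example (assuming a wavefront size of 64): thread_limit(128) in
// GENERIC mode yields threadsPerGroup = 128 + 64 = 192 to make room for the
// master wavefront; the value is then clamped against ConstWGSize and
// Max_WG_Size before num_groups is derived from the team clauses, the
// environment limits and the loop trip count.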
1422 void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
1423                    int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
1424                    int num_teams, int thread_limit, uint64_t loop_tripcount) {
1425 
1426   int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
1427                       ? DeviceInfo.EnvMaxTeamsDefault
1428                       : DeviceInfo.Max_Teams;
1429   if (Max_Teams > DeviceInfo.HardTeamLimit)
1430     Max_Teams = DeviceInfo.HardTeamLimit;
1431 
1432   if (print_kernel_trace == 4) {
1433     fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
1434             RTLDeviceInfoTy::Max_Teams);
1435     fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
1436     fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
1437             RTLDeviceInfoTy::Warp_Size);
1438     fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
1439             RTLDeviceInfoTy::Max_WG_Size);
1440     fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
1441             RTLDeviceInfoTy::Default_WG_Size);
1442     fprintf(stderr, "thread_limit: %d\n", thread_limit);
1443     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
1444     fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
1445   }
1446   // check for thread_limit() clause
1447   if (thread_limit > 0) {
1448     threadsPerGroup = thread_limit;
1449     DP("Setting threads per block to requested %d\n", thread_limit);
1450     if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
1451       threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
1452       DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
1453     }
1454     if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
1455       threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
1456       DP("Setting threads per block to maximum %d\n", threadsPerGroup);
1457     }
1458   }
1459   // check flat_max_work_group_size attr here
1460   if (threadsPerGroup > ConstWGSize) {
1461     threadsPerGroup = ConstWGSize;
1462     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1463        threadsPerGroup);
1464   }
1465   if (print_kernel_trace == 4)
1466     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
1467   DP("Preparing %d threads\n", threadsPerGroup);
1468 
1469   // Set default num_groups (teams)
1470   if (DeviceInfo.EnvTeamLimit > 0)
1471     num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit)
1472                      ? Max_Teams
1473                      : DeviceInfo.EnvTeamLimit;
1474   else
1475     num_groups = Max_Teams;
1476   DP("Set default num of groups %d\n", num_groups);
1477 
1478   if (print_kernel_trace == 4) {
1479     fprintf(stderr, "num_groups: %d\n", num_groups);
1480     fprintf(stderr, "num_teams: %d\n", num_teams);
1481   }
1482 
1483   // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Default_WG_Size.
1484   // This reduction is typical for the default case (no thread_limit clause)
1485   // or when the user requests a very large num_teams clause.
1486   // FIXME: We can't distinguish between a constant and a variable thread
1487   // limit, so we only handle constant thread_limits.
1488   if (threadsPerGroup >
1489       RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
1490     // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
1491     // here?
1492     num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
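  // The division above keeps the total number of threads within the
  // Max_Teams * Max_WG_Size budget once groups grow past Default_WG_Size.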
1493 
1494   // check for num_teams() clause
1495   if (num_teams > 0) {
1496     num_groups = (num_teams < num_groups) ? num_teams : num_groups;
1497   }
1498   if (print_kernel_trace == 4) {
1499     fprintf(stderr, "num_groups: %d\n", num_groups);
1500     fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
1501     fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
1502   }
1503 
1504   if (DeviceInfo.EnvNumTeams > 0) {
1505     num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams
1506                                                        : num_groups;
1507     DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
1508   } else if (DeviceInfo.EnvTeamLimit > 0) {
1509     num_groups = (DeviceInfo.EnvTeamLimit < num_groups)
1510                      ? DeviceInfo.EnvTeamLimit
1511                      : num_groups;
1512     DP("Modifying teams based on EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
1513   } else {
1514     if (num_teams <= 0) {
1515       if (loop_tripcount > 0) {
1516         if (ExecutionMode == SPMD) {
1517           // round up to the nearest integer
1518           num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
1519         } else {
1520           num_groups = loop_tripcount;
1521         }
1522         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1523            "threads per block %d\n",
1524            num_groups, loop_tripcount, threadsPerGroup);
1525       }
1526     } else {
1527       num_groups = num_teams;
1528     }
1529     if (num_groups > Max_Teams) {
1530       num_groups = Max_Teams;
1531       if (print_kernel_trace == 4)
1532         fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups,
1533                 Max_Teams);
1534     }
1535     if (num_groups > num_teams && num_teams > 0) {
1536       num_groups = num_teams;
1537       if (print_kernel_trace == 4)
1538         fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
1539                 num_groups, num_teams);
1540     }
1541   }
1542 
1543   // The num_teams clause is always honored unless DEFAULT is active.
1544   if (num_teams > 0) {
1545     num_groups = num_teams;
1546     // Cap num_groups to EnvMaxTeamsDefault if set.
1547     if (DeviceInfo.EnvMaxTeamsDefault > 0 &&
1548         num_groups > DeviceInfo.EnvMaxTeamsDefault)
1549       num_groups = DeviceInfo.EnvMaxTeamsDefault;
1550   }
1551   if (print_kernel_trace == 4) {
1552     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
1553     fprintf(stderr, "num_groups: %d\n", num_groups);
1554     fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
1555   }
1556   DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
1557      threadsPerGroup);
1558 }
1559 
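// Reserve a slot in the HSA queue ring buffer: atomically bump the write
// index, then spin until the reserved packet_id is fewer than queue->size
// packets ahead of the read index, i.e. until the slot is actually free.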
1560 static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
1561   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
1562   bool full = true;
1563   while (full) {
1564     full =
1565         packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
1566   }
1567   return packet_id;
1568 }
1569 
1570 extern bool g_atmi_hostcall_required; // declared without header by atmi
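// When ATMI reports that hostcall support is required, a hostrpc buffer is
// attached to the queue (under a mutex) before the dispatch packet is
// submitted below.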
1571 
1572 static int32_t __tgt_rtl_run_target_team_region_locked(
1573     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1574     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
1575     int32_t thread_limit, uint64_t loop_tripcount);
1576 
1577 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1578                                          void **tgt_args,
1579                                          ptrdiff_t *tgt_offsets,
1580                                          int32_t arg_num, int32_t num_teams,
1581                                          int32_t thread_limit,
1582                                          uint64_t loop_tripcount) {
1583 
1584   DeviceInfo.load_run_lock.lock_shared();
1585   int32_t res = __tgt_rtl_run_target_team_region_locked(
1586       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
1587       thread_limit, loop_tripcount);
1588 
1589   DeviceInfo.load_run_lock.unlock_shared();
1590   return res;
1591 }
1592 
1593 int32_t __tgt_rtl_run_target_team_region_locked(
1594     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1595     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
1596     int32_t thread_limit, uint64_t loop_tripcount) {
1597   // Update the thread limit in GPU memory if it is uninitialized or was
1598   // specified from the host.
1600 
1601   DP("Run target team region thread_limit %d\n", thread_limit);
1602 
1603   // All args are references.
1604   std::vector<void *> args(arg_num);
1605   std::vector<void *> ptrs(arg_num);
1606 
1607   DP("Arg_num: %d\n", arg_num);
1608   for (int32_t i = 0; i < arg_num; ++i) {
1609     ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
1610     args[i] = &ptrs[i];
1611     DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
1612   }
1613 
1614   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
1615 
1616   /*
1617    * Set limit based on ThreadsPerGroup and GroupsPerDevice
1618    */
1619   int num_groups = 0;
1620 
1621   int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
1622 
1623   getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
1624                 KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
1625                 DeviceInfo.EnvNumTeams,
1626                 num_teams,     // From run_region arg
1627                 thread_limit,  // From run_region arg
1628                 loop_tripcount // From run_region arg
1629   );
1630 
1631   if (print_kernel_trace == 4)
1632     // enum modes are SPMD, GENERIC, NONE 0,1,2
1633     fprintf(stderr,
1634             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1635             "reqd:(%4dX%4d) n:%s\n",
1636             device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1637             arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
1638             KernelInfo->Name);
1639 
1640   // Run on the device.
1641   {
1642     hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
1643     uint64_t packet_id = acquire_available_packet_id(queue);
1644 
1645     const uint32_t mask = queue->size - 1; // size is a power of 2
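    // Example: a queue of size 1024 gives mask 0x3ff, so (packet_id & mask)
    // wraps the monotonically increasing id onto a slot within the ring.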
1646     hsa_kernel_dispatch_packet_t *packet =
1647         (hsa_kernel_dispatch_packet_t *)queue->base_address +
1648         (packet_id & mask);
1649 
1650     // packet->header is written last
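    // The setup field encodes the number of dispatch dimensions; this launch
    // uses a 1-D grid.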
1651     packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1652     packet->workgroup_size_x = threadsPerGroup;
1653     packet->workgroup_size_y = 1;
1654     packet->workgroup_size_z = 1;
1655     packet->reserved0 = 0;
1656     packet->grid_size_x = num_groups * threadsPerGroup;
1657     packet->grid_size_y = 1;
1658     packet->grid_size_z = 1;
1659     packet->private_segment_size = 0;
1660     packet->group_segment_size = 0;
1661     packet->kernel_object = 0;
1662     packet->kernarg_address = 0;     // use the block allocator
1663     packet->reserved2 = 0;           // atmi writes id_ here
1664     packet->completion_signal = {0}; // may want a pool of signals
1665 
1666     std::string kernel_name = std::string(KernelInfo->Name);
1667     {
1668       assert(KernelInfoTable[device_id].find(kernel_name) !=
1669              KernelInfoTable[device_id].end());
1670       auto it = KernelInfoTable[device_id][kernel_name];
1671       packet->kernel_object = it.kernel_object;
1672       packet->private_segment_size = it.private_segment_size;
1673       packet->group_segment_size = it.group_segment_size;
1674       assert(arg_num == (int)it.num_args);
1675     }
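    // kernel_object and the segment sizes above come from the kernel metadata
    // recorded in KernelInfoTable when the code object was loaded.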
1676 
1677     KernelArgPool *ArgPool = nullptr;
1678     {
1679       auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1680       if (it != KernelArgPoolMap.end()) {
1681         ArgPool = (it->second).get();
1682       }
1683     }
1684     if (!ArgPool) {
1685       fprintf(stderr, "Warning: No ArgPool for %s on device %d\n",
1686               KernelInfo->Name, device_id);
1687     }
1688     {
1689       void *kernarg = nullptr;
1690       if (ArgPool) {
1691         assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
1692         kernarg = ArgPool->allocate(arg_num);
1693       }
1694       if (!kernarg) {
1695         printf("Allocate kernarg failed\n");
1696         exit(1);
1697       }
1698 
1699       // Copy explicit arguments
1700       for (int i = 0; i < arg_num; i++) {
1701         memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
1702       }
1703 
1704       // Initialize implicit arguments. ATMI seems to leave most fields
1705       // uninitialized
1706       atmi_implicit_args_t *impl_args =
1707           reinterpret_cast<atmi_implicit_args_t *>(
1708               static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
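      // The implicit argument block sits directly after the explicit kernarg
      // segment within the same kernarg allocation.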
1709       memset(impl_args, 0,
1710              sizeof(atmi_implicit_args_t)); // may not be necessary
1711       impl_args->offset_x = 0;
1712       impl_args->offset_y = 0;
1713       impl_args->offset_z = 0;
1714 
1715       // assign a hostcall buffer for the selected Q
1716       if (g_atmi_hostcall_required) {
1717         // hostrpc_assign_buffer is not thread safe, and this function is
1718         // under a multiple reader lock, not a writer lock.
1719         static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
1720         pthread_mutex_lock(&hostcall_init_lock);
1721         impl_args->hostcall_ptr = hostrpc_assign_buffer(
1722             DeviceInfo.HSAAgents[device_id], queue, device_id);
1723         pthread_mutex_unlock(&hostcall_init_lock);
1724         if (!impl_args->hostcall_ptr) {
1725           DP("hostrpc_assign_buffer failed, the GPU would dereference null "
1726              "and fault\n");
1727           return OFFLOAD_FAIL;
1728         }
1729       }
1730 
1731       packet->kernarg_address = kernarg;
1732     }
1733 
1734     {
1735       hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
1736       if (s.handle == 0) {
1737         printf("Failed to get signal instance\n");
1738         exit(1);
1739       }
1740       packet->completion_signal = s;
1741       hsa_signal_store_relaxed(packet->completion_signal, 1);
1742     }
1743 
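    // Publish the packet: write the header with release semantics so the
    // packet processor observes a fully initialized packet, ring the doorbell,
    // then block on the completion signal until the kernel has finished.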
1744     core::packet_store_release(
1745         reinterpret_cast<uint32_t *>(packet),
1746         core::create_header(HSA_PACKET_TYPE_KERNEL_DISPATCH, 0,
1747                             ATMI_FENCE_SCOPE_SYSTEM, ATMI_FENCE_SCOPE_SYSTEM),
1748         packet->setup);
1749 
1750     hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
1751 
1752     while (hsa_signal_wait_scacquire(packet->completion_signal,
1753                                      HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1754                                      HSA_WAIT_STATE_BLOCKED) != 0)
1755       ;
1756 
1757     assert(ArgPool);
1758     ArgPool->deallocate(packet->kernarg_address);
1759     DeviceInfo.FreeSignalPool.push(packet->completion_signal);
1760   }
1761 
1762   DP("Kernel completed\n");
1763   return OFFLOAD_SUCCESS;
1764 }
1765 
1766 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1767                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1768                                     int32_t arg_num) {
1769   // Run the serial target region with a single team and the default
1770   // thread limit (thread_limit of 0 selects the default).
1771   int32_t team_num = 1;
1772   int32_t thread_limit = 0; // use default
1773   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
1774                                           tgt_offsets, arg_num, team_num,
1775                                           thread_limit, 0);
1776 }
1777 
1778 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1779                                           void *tgt_entry_ptr, void **tgt_args,
1780                                           ptrdiff_t *tgt_offsets,
1781                                           int32_t arg_num,
1782                                           __tgt_async_info *async_info_ptr) {
1783   assert(async_info_ptr && "async_info is nullptr");
1784   initAsyncInfoPtr(async_info_ptr);
1785 
1786   // Run the serial target region with a single team and the default
1787   // thread limit (thread_limit of 0 selects the default).
1788   int32_t team_num = 1;
1789   int32_t thread_limit = 0; // use default
1790   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
1791                                           tgt_offsets, arg_num, team_num,
1792                                           thread_limit, 0);
1793 }
1794 
1795 int32_t __tgt_rtl_synchronize(int32_t device_id,
1796                               __tgt_async_info *async_info_ptr) {
1797   assert(async_info_ptr && "async_info is nullptr");
1798 
1799   // The CUDA plugin asserts that async_info_ptr->Queue is non-null, but this
1800   // invariant is not ensured by devices.cpp for amdgcn.
1801   // assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1802   if (async_info_ptr->Queue) {
1803     finiAsyncInfoPtr(async_info_ptr);
1804   }
1805   return OFFLOAD_SUCCESS;
1806 }
1807