1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for NEC Aurora TSUBASA machines
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include <algorithm>
15 #include <cassert>
16 #include <cerrno>
17 #include <cstring>
18 #include <list>
19 #include <stdlib.h>
20 #include <string>
21 #include <sys/stat.h>
22 #include <ve_offload.h>
23 #include <vector>
24 #include <veosinfo/veosinfo.h>
25
26 #include "Debug.h"
27 #include "omptargetplugin.h"
28
// Name of the offload target. It is only used below to build the debug
// prefix; a definition supplied from outside this file takes precedence.
29 #ifndef TARGET_NAME
30 #define TARGET_NAME VE
31 #endif
32
// Prefix string for debug output of this RTL.
// NOTE(review): GETNAME is presumably provided by Debug.h - confirm.
33 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
34
// ELF machine id that __tgt_rtl_is_valid_binary() compares images against.
// With the fallback value 0 that function returns 0 for every image.
35 #ifndef TARGET_ELF_ID
36 #define TARGET_ELF_ID 0
37 #endif
38
// Textual include of the shared ELF helpers.
// NOTE(review): presumably the source of elf_check_machine() and
// elf_is_dynamic() used further down - confirm.
39 #include "../../common/elf_common.c"
40
41 struct DynLibTy {
42 char *FileName;
43 uint64_t VeoLibHandle;
44 };
45
46 /// Keep entries table per device.
47 struct FuncOrGblEntryTy {
48 __tgt_target_table Table;
49 std::vector<__tgt_offload_entry> Entries;
50 };
51
52 class RTLDeviceInfoTy {
53 std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
54
55 public:
56 std::vector<struct veo_proc_handle *> ProcHandles;
57 std::vector<struct veo_thr_ctxt *> Contexts;
58 std::vector<uint64_t> LibraryHandles;
59 std::list<DynLibTy> DynLibs;
60 // Maps OpenMP device Ids to Ve nodeids
61 std::vector<int> NodeIds;
62
buildOffloadTableFromHost(int32_t device_id,uint64_t VeoLibHandle,__tgt_offload_entry * HostBegin,__tgt_offload_entry * HostEnd)63 void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
64 __tgt_offload_entry *HostBegin,
65 __tgt_offload_entry *HostEnd) {
66 FuncOrGblEntry[device_id].emplace_back();
67 std::vector<__tgt_offload_entry> &T =
68 FuncOrGblEntry[device_id].back().Entries;
69 T.clear();
70 for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
71 char *SymbolName = i->name;
72 // we have not enough access to the target memory to conveniently parse
73 // the offload table there so we need to lookup every symbol with the host
74 // table
75 DP("Looking up symbol: %s\n", SymbolName);
76 uint64_t SymbolTargetAddr =
77 veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
78 __tgt_offload_entry Entry;
79
80 if (!SymbolTargetAddr) {
81 DP("Symbol %s not found in target image\n", SymbolName);
82 Entry = {NULL, NULL, 0, 0, 0};
83 } else {
84 DP("Found symbol %s successfully in target image (addr: %p)\n",
85 SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
86 Entry = { reinterpret_cast<void *>(SymbolTargetAddr),
87 i->name,
88 i->size,
89 i->flags,
90 0 };
91 }
92
93 T.push_back(Entry);
94 }
95
96 FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
97 FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
98 }
99
getOffloadTable(int32_t device_id)100 __tgt_target_table *getOffloadTable(int32_t device_id) {
101 return &FuncOrGblEntry[device_id].back().Table;
102 }
103
RTLDeviceInfoTy()104 RTLDeviceInfoTy() {
105
106 struct ve_nodeinfo node_info;
107 ve_node_info(&node_info);
108
109 // Build a predictable mapping between VE node ids and OpenMP device ids.
110 // This is necessary, because nodes can be missing or offline and (active)
111 // node ids are thus not consecutive. The entries in ve_nodeinfo may also
112 // not be in the order of their node ids.
113 for (int i = 0; i < node_info.total_node_count; ++i) {
114 if (node_info.status[i] == 0) {
115 NodeIds.push_back(node_info.nodeid[i]);
116 }
117 }
118
119 // Because the entries in ve_nodeinfo may not be in the order of their node
120 // ids, we sort NodeIds to get a predictable mapping.
121 std::sort(NodeIds.begin(), NodeIds.end());
122
123 int NumDevices = NodeIds.size();
124 DP("Found %i VE devices\n", NumDevices);
125 ProcHandles.resize(NumDevices, NULL);
126 Contexts.resize(NumDevices, NULL);
127 FuncOrGblEntry.resize(NumDevices);
128 LibraryHandles.resize(NumDevices);
129 }
130
~RTLDeviceInfoTy()131 ~RTLDeviceInfoTy() {
132 for (auto &ctx : Contexts) {
133 if (ctx != NULL) {
134 if (veo_context_close(ctx) != 0) {
135 DP("Failed to close VEO context.\n");
136 }
137 }
138 }
139
140 for (auto &hdl : ProcHandles) {
141 if (hdl != NULL) {
142 veo_proc_destroy(hdl);
143 }
144 }
145
146 for (auto &lib : DynLibs) {
147 if (lib.FileName) {
148 remove(lib.FileName);
149 }
150 }
151 }
152 };
153
154 static RTLDeviceInfoTy DeviceInfo;
155
target_run_function_wait(uint32_t DeviceID,uint64_t FuncAddr,struct veo_args * args,uint64_t * RetVal)156 static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
157 struct veo_args *args, uint64_t *RetVal) {
158 DP("Running function with entry point %p\n",
159 reinterpret_cast<void *>(FuncAddr));
160 uint64_t RequestHandle =
161 veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
162 if (RequestHandle == VEO_REQUEST_ID_INVALID) {
163 DP("Execution of entry point %p failed\n",
164 reinterpret_cast<void *>(FuncAddr));
165 return OFFLOAD_FAIL;
166 }
167
168 DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
169 reinterpret_cast<void *>(FuncAddr), RequestHandle);
170
171 int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
172 RetVal);
173 if (ret != 0) {
174 DP("Waiting for entry point %p failed (Error code %d)\n",
175 reinterpret_cast<void *>(FuncAddr), ret);
176 return OFFLOAD_FAIL;
177 }
178 return OFFLOAD_SUCCESS;
179 }
180
181
182 // Return the number of available devices of the type supported by the
183 // target RTL.
__tgt_rtl_number_of_devices(void)184 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
185
186 // Return an integer different from zero if the provided device image can be
187 // supported by the runtime. The functionality is similar to comparing the
188 // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
189 // lightweight query to determine if the RTL is suitable for an image without
190 // having to load the library, which can be expensive.
__tgt_rtl_is_valid_binary(__tgt_device_image * Image)191 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
192 #if TARGET_ELF_ID < 1
193 return 0;
194 #else
195 return elf_check_machine(Image, TARGET_ELF_ID);
196 #endif
197 }
198
199 // Initialize the specified device. In case of success return 0; otherwise
200 // return an error code.
__tgt_rtl_init_device(int32_t ID)201 int32_t __tgt_rtl_init_device(int32_t ID) {
202 DP("Available VEO version: %i\n", veo_api_version());
203
204 // At the moment we do not really initialize (i.e. create a process or
205 // context on) the device here, but in "__tgt_rtl_load_binary".
206 // The reason for this is, that, when we create a process for a statically
207 // linked binary, the VEO api needs us to already supply the binary (but we
208 // can load a dynamically linked binary later, after we create the process).
209 // At this stage, we cannot check if we have a dynamically or statically
210 // linked binary so we defer process creation until we know.
211 return OFFLOAD_SUCCESS;
212 }
213
214 // Pass an executable image section described by image to the specified
215 // device and prepare an address table of target entities. In case of error,
216 // return NULL. Otherwise, return a pointer to the built address table.
217 // Individual entries in the table may also be NULL, when the corresponding
218 // offload region is not supported on the target device.
__tgt_rtl_load_binary(int32_t ID,__tgt_device_image * Image)219 __tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
220 __tgt_device_image *Image) {
221 DP("Dev %d: load binary from " DPxMOD " image\n", ID,
222 DPxPTR(Image->ImageStart));
223
224 assert(ID >= 0 && "bad dev id");
225
226 size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
227 size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
228 DP("Expecting to have %zd entries defined.\n", NumEntries);
229
230 // load dynamic library and get the entry points. We use the dl library
231 // to do the loading of the library, but we could do it directly to avoid the
232 // dump to the temporary file.
233 //
234 // 1) Create tmp file with the library contents.
235 // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
236 char tmp_name[] = "/tmp/tmpfile_XXXXXX";
237 int tmp_fd = mkstemp(tmp_name);
238
239 if (tmp_fd == -1) {
240 return NULL;
241 }
242
243 FILE *ftmp = fdopen(tmp_fd, "wb");
244
245 if (!ftmp) {
246 DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
247 return NULL;
248 }
249
250 fwrite(Image->ImageStart, ImageSize, 1, ftmp);
251
252 // at least for the static case we need to change the permissions
253 chmod(tmp_name, 0700);
254
255 DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
256
257 fclose(ftmp);
258
259 // See comment in "__tgt_rtl_init_device"
260 bool is_dyn = true;
261 if (DeviceInfo.ProcHandles[ID] == NULL) {
262 struct veo_proc_handle *proc_handle;
263 is_dyn = elf_is_dynamic(Image);
264 // If we have a dynamically linked image, we create the process handle, then
265 // the thread, and then load the image.
266 // If we have a statically linked image, we need to create the process
267 // handle and load the image at the same time with veo_proc_create_static().
268 if (is_dyn) {
269 proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
270 if (!proc_handle) {
271 DP("veo_proc_create() failed for device %d\n", ID);
272 return NULL;
273 }
274 } else {
275 proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
276 if (!proc_handle) {
277 DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
278 tmp_name);
279 return NULL;
280 }
281 }
282 DeviceInfo.ProcHandles[ID] = proc_handle;
283 }
284
285 if (DeviceInfo.Contexts[ID] == NULL) {
286 struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
287
288 if (!ctx) {
289 DP("veo_context_open() failed: %s\n", std::strerror(errno));
290 return NULL;
291 }
292
293 DeviceInfo.Contexts[ID] = ctx;
294 }
295
296 DP("Aurora device successfully initialized with loaded binary: "
297 "proc_handle=%p, ctx=%p\n",
298 DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
299
300 uint64_t LibHandle = 0UL;
301 if (is_dyn) {
302 LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
303
304 if (!LibHandle) {
305 DP("veo_load_library() failed: LibHandle=%" PRIu64
306 " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
307 LibHandle, tmp_name);
308 return NULL;
309 }
310
311 DP("Successfully loaded library dynamically\n");
312 } else {
313 DP("Symbol table is expected to have been created by "
314 "veo_create_proc_static()\n");
315 }
316
317 DynLibTy Lib = {tmp_name, LibHandle};
318 DeviceInfo.DynLibs.push_back(Lib);
319 DeviceInfo.LibraryHandles[ID] = LibHandle;
320
321 DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
322 Image->EntriesEnd);
323
324 return DeviceInfo.getOffloadTable(ID);
325 }
326
327 // Allocate data on the particular target device, of the specified size.
328 // HostPtr is a address of the host data the allocated target data
329 // will be associated with (HostPtr may be NULL if it is not known at
330 // allocation time, like for example it would be for target data that
331 // is allocated by omp_target_alloc() API). Return address of the
332 // allocated data on the target that will be used by libomptarget.so to
333 // initialize the target data mapping structures. These addresses are
334 // used to generate a table of target variables to pass to
335 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
336 // case an error occurred on the target device.
__tgt_rtl_data_alloc(int32_t ID,int64_t Size,void * HostPtr)337 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
338 int ret;
339 uint64_t addr;
340
341 if (DeviceInfo.ProcHandles[ID] == NULL) {
342 struct veo_proc_handle *proc_handle;
343 proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
344 if (!proc_handle) {
345 DP("veo_proc_create() failed for device %d\n", ID);
346 return NULL;
347 }
348 DeviceInfo.ProcHandles[ID] = proc_handle;
349 DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
350 }
351
352 ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
353 DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
354 ID, reinterpret_cast<void *>(addr), Size);
355 if (ret != 0) {
356 DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n",
357 ID, reinterpret_cast<void *>(addr), Size, ret);
358 return NULL;
359 }
360
361 return reinterpret_cast<void *>(addr);
362 }
363
364 // Pass the data content to the target device using the target address.
365 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_submit(int32_t ID,void * TargetPtr,void * HostPtr,int64_t Size)366 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
367 int64_t Size) {
368 int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
369 HostPtr, (size_t)Size);
370 if (ret != 0) {
371 DP("veo_write_mem() failed with error code %d\n", ret);
372 return OFFLOAD_FAIL;
373 }
374 return OFFLOAD_SUCCESS;
375 }
376
377 // Retrieve the data content from the target device using its address.
378 // In case of success, return zero. Otherwise, return an error code.
__tgt_rtl_data_retrieve(int32_t ID,void * HostPtr,void * TargetPtr,int64_t Size)379 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
380 int64_t Size) {
381 int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
382 (uint64_t)TargetPtr, Size);
383 if (ret != 0) {
384 DP("veo_read_mem() failed with error code %d\n", ret);
385 return OFFLOAD_FAIL;
386 }
387 return OFFLOAD_SUCCESS;
388 }
389
390 // De-allocate the data referenced by target ptr on the device. In case of
391 // success, return zero. Otherwise, return an error code.
__tgt_rtl_data_delete(int32_t ID,void * TargetPtr)392 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
393 int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
394
395 if (ret != 0) {
396 DP("veo_free_mem() failed with error code %d\n", ret);
397 return OFFLOAD_FAIL;
398 }
399 return OFFLOAD_SUCCESS;
400 }
401
402 // Similar to __tgt_rtl_run_target_region, but additionally specify the
403 // number of teams to be created and a number of threads in each team.
__tgt_rtl_run_target_team_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs,int32_t NumTeams,int32_t ThreadLimit,uint64_t loop_tripcount)404 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
405 ptrdiff_t *Offsets, int32_t NumArgs,
406 int32_t NumTeams, int32_t ThreadLimit,
407 uint64_t loop_tripcount) {
408 int ret;
409
410 // ignore team num and thread limit.
411 std::vector<void *> ptrs(NumArgs);
412
413 struct veo_args *TargetArgs;
414 TargetArgs = veo_args_alloc();
415
416 if (TargetArgs == NULL) {
417 DP("Could not allocate VEO args\n");
418 return OFFLOAD_FAIL;
419 }
420
421 for (int i = 0; i < NumArgs; ++i) {
422 ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
423
424 if (ret != 0) {
425 DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
426 ret, i, Args[i]);
427 return OFFLOAD_FAIL;
428 }
429 }
430
431 uint64_t RetVal;
432 if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
433 TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
434 veo_args_free(TargetArgs);
435 return OFFLOAD_FAIL;
436 }
437 veo_args_free(TargetArgs);
438 return OFFLOAD_SUCCESS;
439 }
440
441 // Transfer control to the offloaded entry Entry on the target device.
442 // Args and Offsets are arrays of NumArgs size of target addresses and
443 // offsets. An offset should be added to the target address before passing it
444 // to the outlined function on device side. In case of success, return zero.
445 // Otherwise, return an error code.
__tgt_rtl_run_target_region(int32_t ID,void * Entry,void ** Args,ptrdiff_t * Offsets,int32_t NumArgs)446 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
447 ptrdiff_t *Offsets, int32_t NumArgs) {
448 return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
449 1, 0);
450 }
451