1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "device.h"
15 #include "private.h"
16 #include "rtl.h"
17 
18 #include <cassert>
19 #include <vector>
20 
21 /* All begin addresses for partially mapped structs must be 8-aligned in order
22  * to ensure proper alignment of members. E.g.
23  *
24  * struct S {
25  *   int a;   // 4-aligned
26  *   int b;   // 4-aligned
27  *   int *p;  // 8-aligned
28  * } s1;
29  * ...
30  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
31  * {
32  *   s1.b = 5;
33  *   for (int i...) s1.p[i] = ...;
34  * }
35  *
36  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
37  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
38  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
39  * requirements for its type. Now, when we allocate memory on the device, in
40  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
41  * This means that the chunk of the struct on the device will start at a
42  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
43  * address of p will be a misaligned 0x204 (on the host there was no need to add
44  * padding between b and p, so p comes exactly 4 bytes after b). If the device
45  * kernel tries to access s1.p, a misaligned address error occurs (as reported
46  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
47  * extending the size of the allocated chuck accordingly, the chuck on the
48  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
49  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
50  */
51 static const int64_t Alignment = 8;
52 
53 /// Map global data and execute pending ctors
InitLibrary(DeviceTy & Device)54 static int InitLibrary(DeviceTy& Device) {
55   /*
56    * Map global data
57    */
58   int32_t device_id = Device.DeviceID;
59   int rc = OFFLOAD_SUCCESS;
60 
61   Device.PendingGlobalsMtx.lock();
62   PM->TrlTblMtx.lock();
63   for (HostEntriesBeginToTransTableTy::iterator entry_it =
64            PM->HostEntriesBeginToTransTable.begin();
65        entry_it != PM->HostEntriesBeginToTransTable.end(); ++entry_it) {
66     TranslationTable *TransTable = &entry_it->second;
67     if (TransTable->HostTable.EntriesBegin ==
68         TransTable->HostTable.EntriesEnd) {
69       // No host entry so no need to proceed
70       continue;
71     }
72     if (TransTable->TargetsTable[device_id] != 0) {
73       // Library entries have already been processed
74       continue;
75     }
76 
77     // 1) get image.
78     assert(TransTable->TargetsImages.size() > (size_t)device_id &&
79            "Not expecting a device ID outside the table's bounds!");
80     __tgt_device_image *img = TransTable->TargetsImages[device_id];
81     if (!img) {
82       REPORT("No image loaded for device id %d.\n", device_id);
83       rc = OFFLOAD_FAIL;
84       break;
85     }
86     // 2) load image into the target table.
87     __tgt_target_table *TargetTable =
88         TransTable->TargetsTable[device_id] = Device.load_binary(img);
89     // Unable to get table for this image: invalidate image and fail.
90     if (!TargetTable) {
91       REPORT("Unable to generate entries table for device id %d.\n", device_id);
92       TransTable->TargetsImages[device_id] = 0;
93       rc = OFFLOAD_FAIL;
94       break;
95     }
96 
97     // Verify whether the two table sizes match.
98     size_t hsize =
99         TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
100     size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
101 
102     // Invalid image for these host entries!
103     if (hsize != tsize) {
104       REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
105              device_id, hsize, tsize);
106       TransTable->TargetsImages[device_id] = 0;
107       TransTable->TargetsTable[device_id] = 0;
108       rc = OFFLOAD_FAIL;
109       break;
110     }
111 
112     // process global data that needs to be mapped.
113     Device.DataMapMtx.lock();
114     __tgt_target_table *HostTable = &TransTable->HostTable;
115     for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
116                              *CurrHostEntry = HostTable->EntriesBegin,
117                              *EntryDeviceEnd = TargetTable->EntriesEnd;
118          CurrDeviceEntry != EntryDeviceEnd;
119          CurrDeviceEntry++, CurrHostEntry++) {
120       if (CurrDeviceEntry->size != 0) {
121         // has data.
122         assert(CurrDeviceEntry->size == CurrHostEntry->size &&
123                "data size mismatch");
124 
125         // Fortran may use multiple weak declarations for the same symbol,
126         // therefore we must allow for multiple weak symbols to be loaded from
127         // the fat binary. Treat these mappings as any other "regular" mapping.
128         // Add entry to map.
129         if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
130           continue;
131         DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
132             "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
133             CurrDeviceEntry->size);
134         Device.HostDataToTargetMap.emplace(
135             (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
136             (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
137             (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
138             (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
139             true /*IsRefCountINF*/);
140       }
141     }
142     Device.DataMapMtx.unlock();
143   }
144   PM->TrlTblMtx.unlock();
145 
146   if (rc != OFFLOAD_SUCCESS) {
147     Device.PendingGlobalsMtx.unlock();
148     return rc;
149   }
150 
151   /*
152    * Run ctors for static objects
153    */
154   if (!Device.PendingCtorsDtors.empty()) {
155     // Call all ctors for all libraries registered so far
156     for (auto &lib : Device.PendingCtorsDtors) {
157       if (!lib.second.PendingCtors.empty()) {
158         DP("Has pending ctors... call now\n");
159         for (auto &entry : lib.second.PendingCtors) {
160           void *ctor = entry;
161           int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr,
162                           nullptr, nullptr, nullptr, 1, 1, true /*team*/);
163           if (rc != OFFLOAD_SUCCESS) {
164             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
165             Device.PendingGlobalsMtx.unlock();
166             return OFFLOAD_FAIL;
167           }
168         }
169         // Clear the list to indicate that this device has been used
170         lib.second.PendingCtors.clear();
171         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
172       }
173     }
174   }
175   Device.HasPendingGlobals = false;
176   Device.PendingGlobalsMtx.unlock();
177 
178   return OFFLOAD_SUCCESS;
179 }
180 
181 // Check whether a device has been initialized, global ctors have been
182 // executed and global data has been mapped; do so if not already done.
CheckDeviceAndCtors(int64_t device_id)183 int CheckDeviceAndCtors(int64_t device_id) {
184   // Is device ready?
185   if (!device_is_ready(device_id)) {
186     REPORT("Device %" PRId64 " is not ready.\n", device_id);
187     return OFFLOAD_FAIL;
188   }
189 
190   // Get device info.
191   DeviceTy &Device = PM->Devices[device_id];
192 
193   // Check whether global data has been mapped for this device
194   Device.PendingGlobalsMtx.lock();
195   bool hasPendingGlobals = Device.HasPendingGlobals;
196   Device.PendingGlobalsMtx.unlock();
197   if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
198     REPORT("Failed to init globals on device %" PRId64 "\n", device_id);
199     return OFFLOAD_FAIL;
200   }
201 
202   return OFFLOAD_SUCCESS;
203 }
204 
getParentIndex(int64_t type)205 static int32_t getParentIndex(int64_t type) {
206   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
207 }
208 
209 /// Call the user-defined mapper function followed by the appropriate
210 // target_data_* function (target_data_{begin,end,update}).
targetDataMapper(DeviceTy & Device,void * arg_base,void * arg,int64_t arg_size,int64_t arg_type,void * arg_mapper,TargetDataFuncPtrTy target_data_function)211 int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg,
212                      int64_t arg_size, int64_t arg_type, void *arg_mapper,
213                      TargetDataFuncPtrTy target_data_function) {
214   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
215 
216   // The mapper function fills up Components.
217   MapperComponentsTy MapperComponents;
218   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
219   (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size,
220       arg_type);
221 
222   // Construct new arrays for args_base, args, arg_sizes and arg_types
223   // using the information in MapperComponents and call the corresponding
224   // target_data_* function using these new arrays.
225   std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
226   std::vector<void *> MapperArgs(MapperComponents.Components.size());
227   std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
228   std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
229 
230   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
231     auto &C =
232         MapperComponents
233             .Components[target_data_function == targetDataEnd ? I : E - I - 1];
234     MapperArgsBase[I] = C.Base;
235     MapperArgs[I] = C.Begin;
236     MapperArgSizes[I] = C.Size;
237     MapperArgTypes[I] = C.Type;
238   }
239 
240   int rc = target_data_function(Device, MapperComponents.Components.size(),
241                                 MapperArgsBase.data(), MapperArgs.data(),
242                                 MapperArgSizes.data(), MapperArgTypes.data(),
243                                 /*arg_names*/ nullptr, /*arg_mappers*/ nullptr,
244                                 /*__tgt_async_info*/ nullptr);
245 
246   return rc;
247 }
248 
249 /// Internal function to do the mapping and transfer the data to the device
targetDataBegin(DeviceTy & Device,int32_t arg_num,void ** args_base,void ** args,int64_t * arg_sizes,int64_t * arg_types,map_var_info_t * arg_names,void ** arg_mappers,__tgt_async_info * async_info_ptr)250 int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base,
251                     void **args, int64_t *arg_sizes, int64_t *arg_types,
252                     map_var_info_t *arg_names, void **arg_mappers,
253                     __tgt_async_info *async_info_ptr) {
254   // process each input.
255   for (int32_t i = 0; i < arg_num; ++i) {
256     // Ignore private variables and arrays - there is no mapping for them.
257     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
258         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
259       continue;
260 
261     if (arg_mappers && arg_mappers[i]) {
262       // Instead of executing the regular path of targetDataBegin, call the
263       // targetDataMapper variant which will call targetDataBegin again
264       // with new arguments.
265       DP("Calling targetDataMapper for the %dth argument\n", i);
266 
267       int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i],
268                                 arg_types[i], arg_mappers[i], targetDataBegin);
269 
270       if (rc != OFFLOAD_SUCCESS) {
271         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
272                " failed.\n");
273         return OFFLOAD_FAIL;
274       }
275 
276       // Skip the rest of this function, continue to the next argument.
277       continue;
278     }
279 
280     void *HstPtrBegin = args[i];
281     void *HstPtrBase = args_base[i];
282     int64_t data_size = arg_sizes[i];
283     map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
284 
285     // Adjust for proper alignment if this is a combined entry (for structs).
286     // Look at the next argument - if that is MEMBER_OF this one, then this one
287     // is a combined entry.
288     int64_t padding = 0;
289     const int next_i = i+1;
290     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
291         getParentIndex(arg_types[next_i]) == i) {
292       padding = (int64_t)HstPtrBegin % Alignment;
293       if (padding) {
294         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
295             "\n", padding, DPxPTR(HstPtrBegin));
296         HstPtrBegin = (char *) HstPtrBegin - padding;
297         data_size += padding;
298       }
299     }
300 
301     // Address of pointer on the host and device, respectively.
302     void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
303     bool IsNew, Pointer_IsNew;
304     bool IsHostPtr = false;
305     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
306     // Force the creation of a device side copy of the data when:
307     // a close map modifier was associated with a map that contained a to.
308     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
309     bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
310     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
311     // have reached this point via __tgt_target_data_begin and not __tgt_target
312     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
313     // associated with a target region, so there are no target parameters). This
314     // may be considered a hack, we could revise the scheme in the future.
315     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
316     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
317       DP("Has a pointer entry: \n");
318       // Base is address of pointer.
319       //
320       // Usually, the pointer is already allocated by this time.  For example:
321       //
322       //   #pragma omp target map(s.p[0:N])
323       //
324       // The map entry for s comes first, and the PTR_AND_OBJ entry comes
325       // afterward, so the pointer is already allocated by the time the
326       // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
327       // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
328       // entry for a global that might not already be allocated by the time the
329       // PTR_AND_OBJ entry is handled below, and so the allocation might fail
330       // when HasPresentModifier.
331       PointerTgtPtrBegin = Device.getOrAllocTgtPtr(
332           HstPtrBase, HstPtrBase, sizeof(void *), HstPtrName, Pointer_IsNew,
333           IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier,
334           HasPresentModifier);
335       if (!PointerTgtPtrBegin) {
336         REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
337                HasPresentModifier ? "'present' map type modifier"
338                                   : "device failure or illegal mapping");
339         return OFFLOAD_FAIL;
340       }
341       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
342           "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin),
343           (Pointer_IsNew ? "" : " not"));
344       Pointer_HstPtrBegin = HstPtrBase;
345       // modify current entry.
346       HstPtrBase = *(void **)HstPtrBase;
347       UpdateRef = true; // subsequently update ref count of pointee
348     }
349 
350     void *TgtPtrBegin = Device.getOrAllocTgtPtr(
351         HstPtrBegin, HstPtrBase, data_size, HstPtrName, IsNew, IsHostPtr,
352         IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier);
353     // If data_size==0, then the argument could be a zero-length pointer to
354     // NULL, so getOrAlloc() returning NULL is not an error.
355     if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
356       REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
357              HasPresentModifier ? "'present' map type modifier"
358                                 : "device failure or illegal mapping");
359       return OFFLOAD_FAIL;
360     }
361     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
362         " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
363         (IsNew ? "" : " not"));
364 
365     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
366       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
367       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
368       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
369       args_base[i] = TgtPtrBase;
370     }
371 
372     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
373       bool copy = false;
374       if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
375           HasCloseModifier) {
376         if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
377           copy = true;
378         } else if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
379                    !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
380           // Copy data only if the "parent" struct has RefCount==1.
381           // If this is a PTR_AND_OBJ entry, the OBJ is not part of the struct,
382           // so exclude it from this check.
383           int32_t parent_idx = getParentIndex(arg_types[i]);
384           uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
385           assert(parent_rc > 0 && "parent struct not found");
386           if (parent_rc == 1) {
387             copy = true;
388           }
389         }
390       }
391 
392       if (copy && !IsHostPtr) {
393         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
394            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
395         int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, data_size,
396                                    async_info_ptr);
397         if (rt != OFFLOAD_SUCCESS) {
398           REPORT("Copying data to device failed.\n");
399           return OFFLOAD_FAIL;
400         }
401       }
402     }
403 
404     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
405       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
406          DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
407       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
408       void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
409       int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
410                                  sizeof(void *), async_info_ptr);
411       if (rt != OFFLOAD_SUCCESS) {
412         REPORT("Copying data to device failed.\n");
413         return OFFLOAD_FAIL;
414       }
415       // create shadow pointers for this entry
416       Device.ShadowMtx.lock();
417       Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
418           HstPtrBase, PointerTgtPtrBegin, TgtPtrBase};
419       Device.ShadowMtx.unlock();
420     }
421   }
422 
423   return OFFLOAD_SUCCESS;
424 }
425 
426 namespace {
427 /// This structure contains information to deallocate a target pointer, aka.
428 /// used to call the function \p DeviceTy::deallocTgtPtr.
429 struct DeallocTgtPtrInfo {
430   /// Host pointer used to look up into the map table
431   void *HstPtrBegin;
432   /// Size of the data
433   int64_t DataSize;
434   /// Whether it is forced to be removed from the map table
435   bool ForceDelete;
436   /// Whether it has \p close modifier
437   bool HasCloseModifier;
438 
DeallocTgtPtrInfo__anone97b13310111::DeallocTgtPtrInfo439   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool ForceDelete,
440                     bool HasCloseModifier)
441       : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
442         HasCloseModifier(HasCloseModifier) {}
443 };
444 } // namespace
445 
446 /// Internal function to undo the mapping and retrieve the data from the device.
targetDataEnd(DeviceTy & Device,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,__tgt_async_info * AsyncInfo)447 int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
448                   void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
449                   map_var_info_t *ArgNames, void **ArgMappers,
450                   __tgt_async_info *AsyncInfo) {
451   int Ret;
452   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
453   // process each input.
454   for (int32_t I = ArgNum - 1; I >= 0; --I) {
455     // Ignore private variables and arrays - there is no mapping for them.
456     // Also, ignore the use_device_ptr directive, it has no effect here.
457     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
458         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
459       continue;
460 
461     if (ArgMappers && ArgMappers[I]) {
462       // Instead of executing the regular path of targetDataEnd, call the
463       // targetDataMapper variant which will call targetDataEnd again
464       // with new arguments.
465       DP("Calling targetDataMapper for the %dth argument\n", I);
466 
467       Ret = targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I],
468                              ArgTypes[I], ArgMappers[I], targetDataEnd);
469 
470       if (Ret != OFFLOAD_SUCCESS) {
471         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
472                " failed.\n");
473         return OFFLOAD_FAIL;
474       }
475 
476       // Skip the rest of this function, continue to the next argument.
477       continue;
478     }
479 
480     void *HstPtrBegin = Args[I];
481     int64_t DataSize = ArgSizes[I];
482     // Adjust for proper alignment if this is a combined entry (for structs).
483     // Look at the next argument - if that is MEMBER_OF this one, then this one
484     // is a combined entry.
485     const int NextI = I + 1;
486     if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
487         getParentIndex(ArgTypes[NextI]) == I) {
488       int64_t Padding = (int64_t)HstPtrBegin % Alignment;
489       if (Padding) {
490         DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
491            "\n",
492            Padding, DPxPTR(HstPtrBegin));
493         HstPtrBegin = (char *)HstPtrBegin - Padding;
494         DataSize += Padding;
495       }
496     }
497 
498     bool IsLast, IsHostPtr;
499     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
500     bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
501                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
502     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
503     bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
504     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
505 
506     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
507     void *TgtPtrBegin = Device.getTgtPtrBegin(
508         HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit);
509     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
510       DP("Mapping does not exist (%s)\n",
511          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
512       if (HasPresentModifier) {
513         // This should be an error upon entering an "omp target exit data".  It
514         // should not be an error upon exiting an "omp target data" or "omp
515         // target".  For "omp target data", Clang thus doesn't include present
516         // modifiers for end calls.  For "omp target", we have not found a valid
517         // OpenMP program for which the error matters: it appears that, if a
518         // program can guarantee that data is present at the beginning of an
519         // "omp target" region so that there's no error there, that data is also
520         // guaranteed to be present at the end.
521         MESSAGE("device mapping required by 'present' map type modifier does "
522                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
523                 DPxPTR(HstPtrBegin), DataSize);
524         return OFFLOAD_FAIL;
525       }
526     } else {
527       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
528          " - is%s last\n",
529          DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
530     }
531 
532     bool DelEntry = IsLast || ForceDelete;
533 
534     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
535         !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
536       DelEntry = false; // protect parent struct from being deallocated
537     }
538 
539     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
540       // Move data back to the host
541       if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
542         bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
543         bool CopyMember = false;
544         if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
545             HasCloseModifier) {
546           if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
547               !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
548             // Copy data only if the "parent" struct has RefCount==1.
549             int32_t ParentIdx = getParentIndex(ArgTypes[I]);
550             uint64_t ParentRC = Device.getMapEntryRefCnt(Args[ParentIdx]);
551             assert(ParentRC > 0 && "parent struct not found");
552             if (ParentRC == 1)
553               CopyMember = true;
554           }
555         }
556 
557         if ((DelEntry || Always || CopyMember) &&
558             !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
559               TgtPtrBegin == HstPtrBegin)) {
560           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
561              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
562           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
563                                     AsyncInfo);
564           if (Ret != OFFLOAD_SUCCESS) {
565             REPORT("Copying data from device failed.\n");
566             return OFFLOAD_FAIL;
567           }
568         }
569       }
570 
571       // If we copied back to the host a struct/array containing pointers, we
572       // need to restore the original host pointer values from their shadow
573       // copies. If the struct is going to be deallocated, remove any remaining
574       // shadow pointer entries for this struct.
575       uintptr_t LB = (uintptr_t)HstPtrBegin;
576       uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
577       Device.ShadowMtx.lock();
578       for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
579            Itr != Device.ShadowPtrMap.end();) {
580         void **ShadowHstPtrAddr = (void **)Itr->first;
581 
582         // An STL map is sorted on its keys; use this property
583         // to quickly determine when to break out of the loop.
584         if ((uintptr_t)ShadowHstPtrAddr < LB) {
585           ++Itr;
586           continue;
587         }
588         if ((uintptr_t)ShadowHstPtrAddr >= UB)
589           break;
590 
591         // If we copied the struct to the host, we need to restore the pointer.
592         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
593           DP("Restoring original host pointer value " DPxMOD " for host "
594              "pointer " DPxMOD "\n",
595              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
596           *ShadowHstPtrAddr = Itr->second.HstPtrVal;
597         }
598         // If the struct is to be deallocated, remove the shadow entry.
599         if (DelEntry) {
600           DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
601           Itr = Device.ShadowPtrMap.erase(Itr);
602         } else {
603           ++Itr;
604         }
605       }
606       Device.ShadowMtx.unlock();
607 
608       // Add pointer to the buffer for later deallocation
609       if (DelEntry)
610         DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, ForceDelete,
611                                     HasCloseModifier);
612     }
613   }
614 
615   // We need to synchronize before deallocating data.
616   // If AsyncInfo is nullptr, the previous data transfer (if has) will be
617   // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
618   // nullptr, there is no data transfer happened because once there is,
619   // AsyncInfo->Queue will not be nullptr, so again, we don't need to
620   // synchronize.
621   if (AsyncInfo && AsyncInfo->Queue) {
622     Ret = Device.synchronize(AsyncInfo);
623     if (Ret != OFFLOAD_SUCCESS) {
624       REPORT("Failed to synchronize device.\n");
625       return OFFLOAD_FAIL;
626     }
627   }
628 
629   // Deallocate target pointer
630   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
631     Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
632                                Info.ForceDelete, Info.HasCloseModifier);
633     if (Ret != OFFLOAD_SUCCESS) {
634       REPORT("Deallocating data from device failed.\n");
635       return OFFLOAD_FAIL;
636     }
637   }
638 
639   return OFFLOAD_SUCCESS;
640 }
641 
targetDataContiguous(DeviceTy & Device,void * ArgsBase,void * HstPtrBegin,int64_t ArgSize,int64_t ArgType)642 static int targetDataContiguous(DeviceTy &Device, void *ArgsBase,
643                                 void *HstPtrBegin, int64_t ArgSize,
644                                 int64_t ArgType) {
645   bool IsLast, IsHostPtr;
646   void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
647                                             IsHostPtr, /*MustContain=*/true);
648   if (!TgtPtrBegin) {
649     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
650     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
651       MESSAGE("device mapping required by 'present' motion modifier does not "
652               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
653               DPxPTR(HstPtrBegin), ArgSize);
654       return OFFLOAD_FAIL;
655     }
656     return OFFLOAD_SUCCESS;
657   }
658 
659   if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
660       TgtPtrBegin == HstPtrBegin) {
661     DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
662        DPxPTR(HstPtrBegin));
663     return OFFLOAD_SUCCESS;
664   }
665 
666   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
667     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
668        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
669     int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
670     if (Ret != OFFLOAD_SUCCESS) {
671       REPORT("Copying data from device failed.\n");
672       return OFFLOAD_FAIL;
673     }
674 
675     uintptr_t LB = (uintptr_t)HstPtrBegin;
676     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
677     Device.ShadowMtx.lock();
678     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
679          IT != Device.ShadowPtrMap.end(); ++IT) {
680       void **ShadowHstPtrAddr = (void **)IT->first;
681       if ((uintptr_t)ShadowHstPtrAddr < LB)
682         continue;
683       if ((uintptr_t)ShadowHstPtrAddr >= UB)
684         break;
685       DP("Restoring original host pointer value " DPxMOD
686          " for host pointer " DPxMOD "\n",
687          DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
688       *ShadowHstPtrAddr = IT->second.HstPtrVal;
689     }
690     Device.ShadowMtx.unlock();
691   }
692 
693   if (ArgType & OMP_TGT_MAPTYPE_TO) {
694     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
695        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
696     int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
697     if (Ret != OFFLOAD_SUCCESS) {
698       REPORT("Copying data to device failed.\n");
699       return OFFLOAD_FAIL;
700     }
701 
702     uintptr_t LB = (uintptr_t)HstPtrBegin;
703     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
704     Device.ShadowMtx.lock();
705     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
706          IT != Device.ShadowPtrMap.end(); ++IT) {
707       void **ShadowHstPtrAddr = (void **)IT->first;
708       if ((uintptr_t)ShadowHstPtrAddr < LB)
709         continue;
710       if ((uintptr_t)ShadowHstPtrAddr >= UB)
711         break;
712       DP("Restoring original target pointer value " DPxMOD " for target "
713          "pointer " DPxMOD "\n",
714          DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
715       Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
716                               sizeof(void *), nullptr);
717       if (Ret != OFFLOAD_SUCCESS) {
718         REPORT("Copying data to device failed.\n");
719         Device.ShadowMtx.unlock();
720         return OFFLOAD_FAIL;
721       }
722     }
723     Device.ShadowMtx.unlock();
724   }
725   return OFFLOAD_SUCCESS;
726 }
727 
targetDataNonContiguous(DeviceTy & Device,void * ArgsBase,__tgt_target_non_contig * NonContig,uint64_t Size,int64_t ArgType,int CurrentDim,int DimSize,uint64_t Offset)728 static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase,
729                                    __tgt_target_non_contig *NonContig,
730                                    uint64_t Size, int64_t ArgType,
731                                    int CurrentDim, int DimSize,
732                                    uint64_t Offset) {
733   int Ret = OFFLOAD_SUCCESS;
734   if (CurrentDim < DimSize) {
735     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
736       uint64_t CurOffset =
737           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
738       // we only need to transfer the first element for the last dimension
739       // since we've already got a contiguous piece.
740       if (CurrentDim != DimSize - 1 || I == 0) {
741         Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size,
742                                       ArgType, CurrentDim + 1, DimSize,
743                                       Offset + CurOffset);
744         // Stop the whole process if any contiguous piece returns anything
745         // other than OFFLOAD_SUCCESS.
746         if (Ret != OFFLOAD_SUCCESS)
747           return Ret;
748       }
749     }
750   } else {
751     char *Ptr = (char *)ArgsBase + Offset;
752     DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n",
753        (uint64_t)Ptr, Offset, Size);
754     Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType);
755   }
756   return Ret;
757 }
758 
getNonContigMergedDimension(__tgt_target_non_contig * NonContig,int32_t DimSize)759 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
760                                        int32_t DimSize) {
761   int RemovedDim = 0;
762   for (int I = DimSize - 1; I > 0; --I) {
763     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
764       RemovedDim++;
765   }
766   return RemovedDim;
767 }
768 
769 /// Internal function to pass data to/from the target.
770 // async_info_ptr is currently unused, added here so targetDataUpdate has the
771 // same signature as targetDataBegin and targetDataEnd.
targetDataUpdate(DeviceTy & Device,int32_t ArgNum,void ** ArgsBase,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,__tgt_async_info * AsyncInfoPtr)772 int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
773                      void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
774                      map_var_info_t *ArgNames, void **ArgMappers,
775                      __tgt_async_info *AsyncInfoPtr) {
776   // process each input.
777   for (int32_t I = 0; I < ArgNum; ++I) {
778     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
779         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
780       continue;
781 
782     if (ArgMappers && ArgMappers[I]) {
783       // Instead of executing the regular path of targetDataUpdate, call the
784       // targetDataMapper variant which will call targetDataUpdate again
785       // with new arguments.
786       DP("Calling targetDataMapper for the %dth argument\n", I);
787 
788       int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I],
789                                  ArgTypes[I], ArgMappers[I], targetDataUpdate);
790 
791       if (Ret != OFFLOAD_SUCCESS) {
792         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
793                " failed.\n");
794         return OFFLOAD_FAIL;
795       }
796 
797       // Skip the rest of this function, continue to the next argument.
798       continue;
799     }
800 
801     int Ret = OFFLOAD_SUCCESS;
802 
803     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
804       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
805       int32_t DimSize = ArgSizes[I];
806       uint64_t Size =
807           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
808       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
809       Ret = targetDataNonContiguous(
810           Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
811           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
812     } else {
813       Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I],
814                                  ArgTypes[I]);
815     }
816     if (Ret == OFFLOAD_FAIL)
817       return OFFLOAD_FAIL;
818   }
819   return OFFLOAD_SUCCESS;
820 }
821 
822 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
823                                       OMP_TGT_MAPTYPE_LITERAL |
824                                       OMP_TGT_MAPTYPE_IMPLICIT;
isLambdaMapping(int64_t Mapping)825 static bool isLambdaMapping(int64_t Mapping) {
826   return (Mapping & LambdaMapping) == LambdaMapping;
827 }
828 
829 namespace {
830 /// Find the table information in the map or look it up in the translation
831 /// tables.
getTableMap(void * HostPtr)832 TableMap *getTableMap(void *HostPtr) {
833   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
834   HostPtrToTableMapTy::iterator TableMapIt =
835       PM->HostPtrToTableMap.find(HostPtr);
836 
837   if (TableMapIt != PM->HostPtrToTableMap.end())
838     return &TableMapIt->second;
839 
840   // We don't have a map. So search all the registered libraries.
841   TableMap *TM = nullptr;
842   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
843   for (HostEntriesBeginToTransTableTy::iterator Itr =
844            PM->HostEntriesBeginToTransTable.begin();
845        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
846     // get the translation table (which contains all the good info).
847     TranslationTable *TransTable = &Itr->second;
848     // iterate over all the host table entries to see if we can locate the
849     // host_ptr.
850     __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
851     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
852       if (Cur->addr != HostPtr)
853         continue;
854       // we got a match, now fill the HostPtrToTableMap so that we
855       // may avoid this search next time.
856       TM = &(PM->HostPtrToTableMap)[HostPtr];
857       TM->Table = TransTable;
858       TM->Index = I;
859       return TM;
860     }
861   }
862 
863   return nullptr;
864 }
865 
866 /// Get loop trip count
867 /// FIXME: This function will not work right if calling
868 /// __kmpc_push_target_tripcount in one thread but doing offloading in another
869 /// thread, which might occur when we call task yield.
getLoopTripCount(int64_t DeviceId)870 uint64_t getLoopTripCount(int64_t DeviceId) {
871   DeviceTy &Device = PM->Devices[DeviceId];
872   uint64_t LoopTripCount = 0;
873 
874   {
875     std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
876     auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
877     if (I != Device.LoopTripCnt.end()) {
878       LoopTripCount = I->second;
879       Device.LoopTripCnt.erase(I);
880       DP("loop trip count is %lu.\n", LoopTripCount);
881     }
882   }
883 
884   return LoopTripCount;
885 }
886 
887 /// A class manages private arguments in a target region.
888 class PrivateArgumentManagerTy {
889   /// A data structure for the information of first-private arguments. We can
890   /// use this information to optimize data transfer by packing all
891   /// first-private arguments and transfer them all at once.
892   struct FirstPrivateArgInfoTy {
893     /// The index of the element in \p TgtArgs corresponding to the argument
894     const int Index;
895     /// Host pointer begin
896     const char *HstPtrBegin;
897     /// Host pointer end
898     const char *HstPtrEnd;
899     /// Aligned size
900     const int64_t AlignedSize;
901     /// Host pointer name
902     const map_var_info_t HstPtrName = nullptr;
903 
FirstPrivateArgInfoTy__anone97b13310211::PrivateArgumentManagerTy::FirstPrivateArgInfoTy904     FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
905                           const map_var_info_t HstPtrName = nullptr)
906         : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
907           HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
908           HstPtrName(HstPtrName) {}
909   };
910 
911   /// A vector of target pointers for all private arguments
912   std::vector<void *> TgtPtrs;
913 
914   /// A vector of information of all first-private arguments to be packed
915   std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
916   /// Host buffer for all arguments to be packed
917   std::vector<char> FirstPrivateArgBuffer;
918   /// The total size of all arguments to be packed
919   int64_t FirstPrivateArgSize = 0;
920 
921   /// A reference to the \p DeviceTy object
922   DeviceTy &Device;
923   /// A pointer to a \p __tgt_async_info object
924   __tgt_async_info *AsyncInfo;
925 
926   // TODO: What would be the best value here? Should we make it configurable?
927   // If the size is larger than this threshold, we will allocate and transfer it
928   // immediately instead of packing it.
929   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
930 
931 public:
932   /// Constructor
PrivateArgumentManagerTy(DeviceTy & Dev,__tgt_async_info * AsyncInfo)933   PrivateArgumentManagerTy(DeviceTy &Dev, __tgt_async_info *AsyncInfo)
934       : Device(Dev), AsyncInfo(AsyncInfo) {}
935 
936   /// Add a private argument
addArg(void * HstPtr,int64_t ArgSize,int64_t ArgOffset,bool IsFirstPrivate,void * & TgtPtr,int TgtArgsIndex,const map_var_info_t HstPtrName=nullptr)937   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
938              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
939              const map_var_info_t HstPtrName = nullptr) {
940     // If the argument is not first-private, or its size is greater than a
941     // predefined threshold, we will allocate memory and issue the transfer
942     // immediately.
943     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) {
944       TgtPtr = Device.allocData(ArgSize, HstPtr);
945       if (!TgtPtr) {
946         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
947            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
948         return OFFLOAD_FAIL;
949       }
950 #ifdef OMPTARGET_DEBUG
951       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
952       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
953          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
954          "\n",
955          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
956          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
957 #endif
958       // If first-private, copy data from host
959       if (IsFirstPrivate) {
960         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
961         if (Ret != OFFLOAD_SUCCESS) {
962           DP("Copying data to device failed, failed.\n");
963           return OFFLOAD_FAIL;
964         }
965       }
966       TgtPtrs.push_back(TgtPtr);
967     } else {
968       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
969          DPxPTR(HstPtr), ArgSize);
970       // When reach this point, the argument must meet all following
971       // requirements:
972       // 1. Its size does not exceed the threshold (see the comment for
973       // FirstPrivateArgSizeThreshold);
974       // 2. It must be first-private (needs to be mapped to target device).
975       // We will pack all this kind of arguments to transfer them all at once
976       // to reduce the number of data transfer. We will not take
977       // non-first-private arguments, aka. private arguments that doesn't need
978       // to be mapped to target device, into account because data allocation
979       // can be very efficient with memory manager.
980 
981       // Placeholder value
982       TgtPtr = nullptr;
983       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
984                                        HstPtrName);
985       FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
986     }
987 
988     return OFFLOAD_SUCCESS;
989   }
990 
991   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
992   /// and start the transfer.
packAndTransfer(std::vector<void * > & TgtArgs)993   int packAndTransfer(std::vector<void *> &TgtArgs) {
994     if (!FirstPrivateArgInfo.empty()) {
995       assert(FirstPrivateArgSize != 0 &&
996              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
997       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
998       auto Itr = FirstPrivateArgBuffer.begin();
999       // Copy all host data to this buffer
1000       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1001         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1002         Itr = std::next(Itr, Info.AlignedSize);
1003       }
1004       // Allocate target memory
1005       void *TgtPtr =
1006           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1007       if (TgtPtr == nullptr) {
1008         DP("Failed to allocate target memory for private arguments.\n");
1009         return OFFLOAD_FAIL;
1010       }
1011       TgtPtrs.push_back(TgtPtr);
1012       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1013          FirstPrivateArgSize, DPxPTR(TgtPtr));
1014       // Transfer data to target device
1015       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1016                                   FirstPrivateArgSize, AsyncInfo);
1017       if (Ret != OFFLOAD_SUCCESS) {
1018         DP("Failed to submit data of private arguments.\n");
1019         return OFFLOAD_FAIL;
1020       }
1021       // Fill in all placeholder pointers
1022       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1023       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1024         void *&Ptr = TgtArgs[Info.Index];
1025         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1026         Ptr = reinterpret_cast<void *>(TP);
1027         TP += Info.AlignedSize;
1028         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1029            "\n",
1030            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1031            DPxPTR(Ptr));
1032       }
1033     }
1034 
1035     return OFFLOAD_SUCCESS;
1036   }
1037 
1038   /// Free all target memory allocated for private arguments
free()1039   int free() {
1040     for (void *P : TgtPtrs) {
1041       int Ret = Device.deleteData(P);
1042       if (Ret != OFFLOAD_SUCCESS) {
1043         DP("Deallocation of (first-)private arrays failed.\n");
1044         return OFFLOAD_FAIL;
1045       }
1046     }
1047 
1048     TgtPtrs.clear();
1049 
1050     return OFFLOAD_SUCCESS;
1051   }
1052 };
1053 
1054 /// Process data before launching the kernel, including calling targetDataBegin
1055 /// to map and transfer data to target device, transferring (first-)private
1056 /// variables.
processDataBefore(int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,std::vector<void * > & TgtArgs,std::vector<ptrdiff_t> & TgtOffsets,PrivateArgumentManagerTy & PrivateArgumentManager,__tgt_async_info * AsyncInfo)1057 int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
1058                       void **ArgBases, void **Args, int64_t *ArgSizes,
1059                       int64_t *ArgTypes, map_var_info_t *ArgNames,
1060                       void **ArgMappers, std::vector<void *> &TgtArgs,
1061                       std::vector<ptrdiff_t> &TgtOffsets,
1062                       PrivateArgumentManagerTy &PrivateArgumentManager,
1063                       __tgt_async_info *AsyncInfo) {
1064   DeviceTy &Device = PM->Devices[DeviceId];
1065   int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
1066                             ArgNames, ArgMappers, AsyncInfo);
1067   if (Ret != OFFLOAD_SUCCESS) {
1068     REPORT("Call to targetDataBegin failed, abort target.\n");
1069     return OFFLOAD_FAIL;
1070   }
1071 
1072   // List of (first-)private arrays allocated for this target region
1073   std::vector<int> TgtArgsPositions(ArgNum, -1);
1074 
1075   for (int32_t I = 0; I < ArgNum; ++I) {
1076     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1077       // This is not a target parameter, do not push it into TgtArgs.
1078       // Check for lambda mapping.
1079       if (isLambdaMapping(ArgTypes[I])) {
1080         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1081                "PTR_AND_OBJ must be also MEMBER_OF.");
1082         unsigned Idx = getParentIndex(ArgTypes[I]);
1083         int TgtIdx = TgtArgsPositions[Idx];
1084         assert(TgtIdx != -1 && "Base address must be translated already.");
1085         // The parent lambda must be processed already and it must be the last
1086         // in TgtArgs and TgtOffsets arrays.
1087         void *HstPtrVal = Args[I];
1088         void *HstPtrBegin = ArgBases[I];
1089         void *HstPtrBase = Args[Idx];
1090         bool IsLast, IsHostPtr; // unused.
1091         void *TgtPtrBase =
1092             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1093         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1094         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1095         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1096         void *PointerTgtPtrBegin = Device.getTgtPtrBegin(
1097             HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr);
1098         if (!PointerTgtPtrBegin) {
1099           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1100              DPxPTR(HstPtrVal));
1101           continue;
1102         }
1103         if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1104             TgtPtrBegin == HstPtrBegin) {
1105           DP("Unified memory is active, no need to map lambda captured"
1106              "variable (" DPxMOD ")\n",
1107              DPxPTR(HstPtrVal));
1108           continue;
1109         }
1110         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1111            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1112         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1113                                 sizeof(void *), AsyncInfo);
1114         if (Ret != OFFLOAD_SUCCESS) {
1115           REPORT("Copying data to device failed.\n");
1116           return OFFLOAD_FAIL;
1117         }
1118       }
1119       continue;
1120     }
1121     void *HstPtrBegin = Args[I];
1122     void *HstPtrBase = ArgBases[I];
1123     void *TgtPtrBegin;
1124     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1125     ptrdiff_t TgtBaseOffset;
1126     bool IsLast, IsHostPtr; // unused.
1127     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1128       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1129          DPxPTR(HstPtrBase));
1130       TgtPtrBegin = HstPtrBase;
1131       TgtBaseOffset = 0;
1132     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1133       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1134       // Can be marked for optimization if the next argument(s) do(es) not
1135       // depend on this one.
1136       const bool IsFirstPrivate =
1137           (I >= ArgNum - 1 || !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1138       Ret = PrivateArgumentManager.addArg(
1139           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1140           TgtArgs.size(), HstPtrName);
1141       if (Ret != OFFLOAD_SUCCESS) {
1142         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1143                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1144         return OFFLOAD_FAIL;
1145       }
1146     } else {
1147       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1148         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1149       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1150                                           false, IsHostPtr);
1151       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1152 #ifdef OMPTARGET_DEBUG
1153       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1154       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1155          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1156 #endif
1157     }
1158     TgtArgsPositions[I] = TgtArgs.size();
1159     TgtArgs.push_back(TgtPtrBegin);
1160     TgtOffsets.push_back(TgtBaseOffset);
1161   }
1162 
1163   assert(TgtArgs.size() == TgtOffsets.size() &&
1164          "Size mismatch in arguments and offsets");
1165 
1166   // Pack and transfer first-private arguments
1167   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1168   if (Ret != OFFLOAD_SUCCESS) {
1169     DP("Failed to pack and transfer first private arguments\n");
1170     return OFFLOAD_FAIL;
1171   }
1172 
1173   return OFFLOAD_SUCCESS;
1174 }
1175 
1176 /// Process data after launching the kernel, including transferring data back to
1177 /// host if needed and deallocating target memory of (first-)private variables.
processDataAfter(int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,PrivateArgumentManagerTy & PrivateArgumentManager,__tgt_async_info * AsyncInfo)1178 int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
1179                      void **ArgBases, void **Args, int64_t *ArgSizes,
1180                      int64_t *ArgTypes, map_var_info_t *ArgNames,
1181                      void **ArgMappers,
1182                      PrivateArgumentManagerTy &PrivateArgumentManager,
1183                      __tgt_async_info *AsyncInfo) {
1184   DeviceTy &Device = PM->Devices[DeviceId];
1185 
1186   // Move data from device.
1187   int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
1188                           ArgNames, ArgMappers, AsyncInfo);
1189   if (Ret != OFFLOAD_SUCCESS) {
1190     REPORT("Call to targetDataEnd failed, abort target.\n");
1191     return OFFLOAD_FAIL;
1192   }
1193 
1194   // Free target memory for private arguments
1195   Ret = PrivateArgumentManager.free();
1196   if (Ret != OFFLOAD_SUCCESS) {
1197     REPORT("Failed to deallocate target memory for private args\n");
1198     return OFFLOAD_FAIL;
1199   }
1200 
1201   return OFFLOAD_SUCCESS;
1202 }
1203 } // namespace
1204 
1205 /// performs the same actions as data_begin in case arg_num is
1206 /// non-zero and initiates run of the offloaded region on the target platform;
1207 /// if arg_num is non-zero after the region execution is done it also
1208 /// performs the same action as data_update and data_end above. This function
1209 /// returns 0 if it was able to transfer the execution to a target and an
1210 /// integer different from zero otherwise.
target(int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,int32_t TeamNum,int32_t ThreadLimit,int IsTeamConstruct)1211 int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
1212            void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1213            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1214            int32_t ThreadLimit, int IsTeamConstruct) {
1215   DeviceTy &Device = PM->Devices[DeviceId];
1216 
1217   TableMap *TM = getTableMap(HostPtr);
1218   // No map for this host pointer found!
1219   if (!TM) {
1220     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1221            DPxPTR(HostPtr));
1222     return OFFLOAD_FAIL;
1223   }
1224 
1225   // get target table.
1226   __tgt_target_table *TargetTable = nullptr;
1227   {
1228     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1229     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1230            "Not expecting a device ID outside the table's bounds!");
1231     TargetTable = TM->Table->TargetsTable[DeviceId];
1232   }
1233   assert(TargetTable && "Global data has not been mapped\n");
1234 
1235   __tgt_async_info AsyncInfo;
1236 
1237   std::vector<void *> TgtArgs;
1238   std::vector<ptrdiff_t> TgtOffsets;
1239 
1240   PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo);
1241 
1242   // Process data, such as data mapping, before launching the kernel
1243   int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args,
1244                               ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1245                               TgtOffsets, PrivateArgumentManager, &AsyncInfo);
1246   if (Ret != OFFLOAD_SUCCESS) {
1247     REPORT("Failed to process data before launching the kernel.\n");
1248     return OFFLOAD_FAIL;
1249   }
1250 
1251   // Get loop trip count
1252   uint64_t LoopTripCount = getLoopTripCount(DeviceId);
1253 
1254   // Launch device execution.
1255   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1256   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1257      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1258 
1259   if (IsTeamConstruct)
1260     Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1261                                TgtArgs.size(), TeamNum, ThreadLimit,
1262                                LoopTripCount, &AsyncInfo);
1263   else
1264     Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1265                            TgtArgs.size(), &AsyncInfo);
1266 
1267   if (Ret != OFFLOAD_SUCCESS) {
1268     REPORT("Executing target region abort target.\n");
1269     return OFFLOAD_FAIL;
1270   }
1271 
1272   // Transfer data back and deallocate target memory for (first-)private
1273   // variables
1274   Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes,
1275                          ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager,
1276                          &AsyncInfo);
1277   if (Ret != OFFLOAD_SUCCESS) {
1278     REPORT("Failed to process data after launching the kernel.\n");
1279     return OFFLOAD_FAIL;
1280   }
1281 
1282   return OFFLOAD_SUCCESS;
1283 }
1284