1 //===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the declarations of all library macros, types,
10 // and functions.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef OMPTARGET_H
15 #define OMPTARGET_H
16 
17 #include "target_impl.h"
18 #include "common/debug.h"     // debug
19 #include "interface.h" // interfaces with omp, compiler, and user
20 #include "common/state-queue.h"
21 #include "common/support.h"
22 
// Library version number.
#define OMPTARGET_NVPTX_VERSION 1.1

// Dispatch status values used by the library in its interface with the
// application.
#define DISPATCH_FINISHED    0
#define DISPATCH_NOTFINISHED 1

// Chunk states used by dynamic scheduling.
#define FINISHED     0
#define NOT_FINISHED 1
#define LAST_CHUNK   2

// NOTE(review): presumably selectors for the barrier / ordered counters; their
// users are not visible in this header -- confirm against the call sites.
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
36 
37 // arguments needed for L0 parallelism only.
38 class omptarget_nvptx_SharedArgs {
39 public:
40   // All these methods must be called by the master thread only.
Init()41   INLINE void Init() {
42     args  = buffer;
43     nArgs = MAX_SHARED_ARGS;
44   }
DeInit()45   INLINE void DeInit() {
46     // Free any memory allocated for outlined parallel function with a large
47     // number of arguments.
48     if (nArgs > MAX_SHARED_ARGS) {
49       SafeFree(args, "new extended args");
50       Init();
51     }
52   }
EnsureSize(size_t size)53   INLINE void EnsureSize(size_t size) {
54     if (size > nArgs) {
55       if (nArgs > MAX_SHARED_ARGS) {
56         SafeFree(args, "new extended args");
57       }
58       args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
59       nArgs = size;
60     }
61   }
62   // Called by all threads.
GetArgs()63   INLINE void **GetArgs() const { return args; };
64 private:
65   // buffer of pre-allocated arguments.
66   void *buffer[MAX_SHARED_ARGS];
67   // pointer to arguments buffer.
68   // starts off as a pointer to 'buffer' but can be dynamically allocated.
69   void **args;
70   // starts off as MAX_SHARED_ARGS but can increase in size.
71   uint32_t nArgs;
72 };
73 
74 extern DEVICE SHARED omptarget_nvptx_SharedArgs
75     omptarget_nvptx_globalArgs;
76 
77 // Data structure to keep in shared memory that traces the current slot, stack,
78 // and frame pointer as well as the active threads that didn't exit the current
79 // environment.
80 struct DataSharingStateTy {
81   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
82   void *StackPtr[DS_Max_Warp_Number];
83   void * volatile FramePtr[DS_Max_Warp_Number];
84   __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
85 };
86 // Additional worker slot type which is initialized with the default worker slot
87 // size of 4*32 bytes.
88 struct __kmpc_data_sharing_worker_slot_static {
89   __kmpc_data_sharing_slot *Next;
90   __kmpc_data_sharing_slot *Prev;
91   void *PrevSlotStackPtr;
92   void *DataEnd;
93   char Data[DS_Worker_Warp_Slot_Size];
94 };
95 
96 extern DEVICE SHARED DataSharingStateTy DataSharingState;
97 
98 ////////////////////////////////////////////////////////////////////////////////
99 // task ICV and (implicit & explicit) task state
100 
101 class omptarget_nvptx_TaskDescr {
102 public:
103   // methods for flags
104   INLINE omp_sched_t GetRuntimeSched() const;
105   INLINE void SetRuntimeSched(omp_sched_t sched);
InParallelRegion()106   INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
InL2OrHigherParallelRegion()107   INLINE int InL2OrHigherParallelRegion() const {
108     return items.flags & TaskDescr_InParL2P;
109   }
IsParallelConstruct()110   INLINE int IsParallelConstruct() const {
111     return items.flags & TaskDescr_IsParConstr;
112   }
IsTaskConstruct()113   INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
114   // methods for other fields
ThreadId()115   INLINE uint16_t &ThreadId() { return items.threadId; }
RuntimeChunkSize()116   INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
GetPrevTaskDescr()117   INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
SetPrevTaskDescr(omptarget_nvptx_TaskDescr * taskDescr)118   INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
119     prev = taskDescr;
120   }
121   // init & copy
122   INLINE void InitLevelZeroTaskDescr();
123   INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
124   INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
125   INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
126   INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
127   INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
128   INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
129   INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
130   INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
131                                    uint16_t tid, uint16_t tnum);
132   INLINE void SaveLoopData();
133   INLINE void RestoreLoopData() const;
134 
135 private:
136   // bits for flags: (6 used, 2 free)
137   //   3 bits (SchedMask) for runtime schedule
138   //   1 bit (InPar) if this thread has encountered one or more parallel region
139   //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
140   //   1 bit (InParL2+) if this thread has encountered L2 or higher parallel
141   //   region
142   static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
143   static const uint8_t TaskDescr_InPar = 0x10;
144   static const uint8_t TaskDescr_IsParConstr = 0x20;
145   static const uint8_t TaskDescr_InParL2P = 0x40;
146 
147   struct SavedLoopDescr_items {
148     int64_t loopUpperBound;
149     int64_t nextLowerBound;
150     int64_t chunk;
151     int64_t stride;
152     kmp_sched_t schedule;
153   } loopData;
154 
155   struct TaskDescr_items {
156     uint8_t flags; // 6 bit used (see flag above)
157     uint8_t unused;
158     uint16_t threadId;         // thread id
159     uint64_t runtimeChunkSize; // runtime chunk size
160   } items;
161   omptarget_nvptx_TaskDescr *prev;
162 };
163 
164 // build on kmp
165 typedef struct omptarget_nvptx_ExplicitTaskDescr {
166   omptarget_nvptx_TaskDescr
167       taskDescr; // omptarget_nvptx task description (must be first)
168   kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
169 } omptarget_nvptx_ExplicitTaskDescr;
170 
171 ////////////////////////////////////////////////////////////////////////////////
172 // Descriptor of a parallel region (worksharing in general)
173 
174 class omptarget_nvptx_WorkDescr {
175 
176 public:
177   // access to data
WorkTaskDescr()178   INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
179 
180 private:
181   omptarget_nvptx_TaskDescr masterTaskICV;
182 };
183 
184 ////////////////////////////////////////////////////////////////////////////////
185 
186 class omptarget_nvptx_TeamDescr {
187 public:
188   // access to data
LevelZeroTaskDescr()189   INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
190     return &levelZeroTaskDescr;
191   }
WorkDescr()192   INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
193     return workDescrForActiveParallel;
194   }
195 
196   // init
197   INLINE void InitTeamDescr();
198 
GetPreallocatedSlotAddr(int wid)199   INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
200     worker_rootS[wid].DataEnd =
201         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
202     // We currently do not have a next slot.
203     worker_rootS[wid].Next = 0;
204     worker_rootS[wid].Prev = 0;
205     worker_rootS[wid].PrevSlotStackPtr = 0;
206     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
207   }
208 
209 private:
210   omptarget_nvptx_TaskDescr
211       levelZeroTaskDescr; // icv for team master initial thread
212   omptarget_nvptx_WorkDescr
213       workDescrForActiveParallel; // one, ONLY for the active par
214 
215   ALIGN(16)
216   __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
217 };
218 
219 ////////////////////////////////////////////////////////////////////////////////
220 // thread private data (struct of arrays for better coalescing)
221 // tid refers here to the global thread id
222 // do not support multiple concurrent kernel a this time
223 class omptarget_nvptx_ThreadPrivateContext {
224 public:
225   // task
Level1TaskDescr(int tid)226   INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
227     return &levelOneTaskDescr[tid];
228   }
SetTopLevelTaskDescr(int tid,omptarget_nvptx_TaskDescr * taskICV)229   INLINE void SetTopLevelTaskDescr(int tid,
230                                    omptarget_nvptx_TaskDescr *taskICV) {
231     topTaskDescr[tid] = taskICV;
232   }
233   INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
234   // parallel
NumThreadsForNextParallel(int tid)235   INLINE uint16_t &NumThreadsForNextParallel(int tid) {
236     return nextRegion.tnum[tid];
237   }
238   // schedule (for dispatch)
ScheduleType(int tid)239   INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
Chunk(int tid)240   INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
LoopUpperBound(int tid)241   INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
NextLowerBound(int tid)242   INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
Stride(int tid)243   INLINE int64_t &Stride(int tid) { return stride[tid]; }
244 
TeamContext()245   INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
246 
247   INLINE void InitThreadPrivateContext(int tid);
Cnt()248   INLINE uint64_t &Cnt() { return cnt; }
249 
250 private:
251   // team context for this team
252   omptarget_nvptx_TeamDescr teamContext;
253   // task ICV for implicit threads in the only parallel region
254   omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
255   // pointer where to find the current task ICV (top of the stack)
256   omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
257   union {
258     // Only one of the two is live at the same time.
259     // parallel
260     uint16_t tnum[MAX_THREADS_PER_TEAM];
261   } nextRegion;
262   // schedule (for dispatch)
263   kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
264   int64_t chunk[MAX_THREADS_PER_TEAM];
265   int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
266   // state for dispatch with dyn/guided OR static (never use both at a time)
267   int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
268   int64_t stride[MAX_THREADS_PER_TEAM];
269   uint64_t cnt;
270 };
271 
272 /// Memory manager for statically allocated memory.
273 class omptarget_nvptx_SimpleMemoryManager {
274 private:
275   ALIGN(128) struct MemDataTy {
276     volatile unsigned keys[OMP_STATE_COUNT];
277   } MemData[MAX_SM];
278 
hash(unsigned key)279   INLINE static uint32_t hash(unsigned key) {
280     return key & (OMP_STATE_COUNT - 1);
281   }
282 
283 public:
284   INLINE void Release();
285   INLINE const void *Acquire(const void *buf, size_t size);
286 };
287 
288 ////////////////////////////////////////////////////////////////////////////////
289 
290 ////////////////////////////////////////////////////////////////////////////////
291 // global data tables
292 ////////////////////////////////////////////////////////////////////////////////
293 
294 extern DEVICE omptarget_nvptx_SimpleMemoryManager
295     omptarget_nvptx_simpleMemoryManager;
296 extern DEVICE SHARED uint32_t usedMemIdx;
297 extern DEVICE SHARED uint32_t usedSlotIdx;
298 extern DEVICE SHARED uint8_t
299     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
300 extern DEVICE SHARED uint16_t threadLimit;
301 extern DEVICE SHARED uint16_t threadsInTeam;
302 extern DEVICE SHARED uint16_t nThreads;
303 extern DEVICE SHARED
304     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
305 
306 extern DEVICE SHARED uint32_t execution_param;
307 extern DEVICE SHARED void *ReductionScratchpadPtr;
308 
309 ////////////////////////////////////////////////////////////////////////////////
310 // work function (outlined parallel/simd functions) and arguments.
311 // needed for L1 parallelism only.
312 ////////////////////////////////////////////////////////////////////////////////
313 
314 typedef void *omptarget_nvptx_WorkFn;
315 extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
316     omptarget_nvptx_workFn;
317 
318 ////////////////////////////////////////////////////////////////////////////////
319 // get private data structures
320 ////////////////////////////////////////////////////////////////////////////////
321 
322 INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
323 INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
324 INLINE omptarget_nvptx_TaskDescr *
325 getMyTopTaskDescriptor(bool isSPMDExecutionMode);
326 INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
327 
328 ////////////////////////////////////////////////////////////////////////////////
329 // inlined implementation
330 ////////////////////////////////////////////////////////////////////////////////
331 
332 #include "common/omptargeti.h"
333 
334 #endif
335