1 //===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Parallel implementation in the GPU. Here is the pattern:
10 //
11 //    while (not finished) {
12 //
13 //    if (master) {
14 //      sequential code, decide which par loop to do, or if finished
15 //     __kmpc_kernel_prepare_parallel() // exec by master only
16 //    }
17 //    syncthreads // A
18 //    __kmpc_kernel_parallel() // exec by all
19 //    if (this thread is included in the parallel) {
20 //      switch () for all parallel loops
21 //      __kmpc_kernel_end_parallel() // exec only by threads in parallel
22 //    }
23 //
24 //
//    Threads that are not included in the parallel loop do not execute
//    end_parallel. For each barrier inside the parallel region these
//    non-included threads cycle through syncthreads A again, so they must
//    keep their current threadId, which is larger than the ids of the
//    threads in the team.
30 //
31 //    To make a long story short...
32 //
33 //===----------------------------------------------------------------------===//
34 
35 #include "common/omptarget.h"
36 #include "target_impl.h"
37 
38 ////////////////////////////////////////////////////////////////////////////////
39 // support for parallel that goes parallel (1 static level only)
40 ////////////////////////////////////////////////////////////////////////////////
41 
// Computes the number of threads that will execute the next parallel region.
//
// NumThreadsClause - thread count from a num_threads request; 0 means that no
//                    request was made.
// NThreadsICV      - fallback request used when NumThreadsClause is 0; 0 means
//                    no preference.
// ThreadLimit      - upper bound on the result; 0 means no bound.
//
// Returns the requested count clamped to the number of workers in the team
// (itself clamped by ThreadLimit). With no request at all, every available
// worker is used. When compiled for __CUDA_ARCH__ >= 700 the result is then
// rounded down to a multiple of WARPSIZE, or to 1 if it is below WARPSIZE.
INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  // A non-zero clause value takes precedence over the ICV.
  const uint16_t Requested =
      (NumThreadsClause != 0) ? NumThreadsClause : NThreadsICV;

  // Begin with all workers of the team and apply the thread limit, if any.
  uint16_t Available = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < Available)
    Available = ThreadLimit;

  // Honor the request only when it is set and smaller than what is available.
  uint16_t Result =
      (Requested != 0 && Requested < Available) ? Requested : Available;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // Volta and newer: every lane of a warp has to take part in the parallel
  // region. OpenMP permits running with fewer threads than requested, so
  // shrink to a whole number of warps, or to a single thread when less than
  // one warp is left.
  if (Result < WARPSIZE)
    Result = 1;
  else
    Result = (Result & ~((uint16_t)WARPSIZE - 1));
#endif

  return Result;
}
73 
// Sets up the next parallel region: records the outlined function, decides
// the team size and fills in the work descriptor.
//
// WorkFn - outlined function of the parallel region; stored in
//          omptarget_nvptx_workFn, from where __kmpc_kernel_parallel reads it.
//
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  // Note: the work function is stored before the checks below, so it is
  // recorded even when the function returns early.
  omptarget_nvptx_workFn = WorkFn;

  // The caller is the team master, i.e. the first thread of the last warp.
  // It shadows the first worker thread and hence always uses logical
  // thread id 0.
  const int masterLogicalId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
          masterLogicalId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  // Pending num_threads request of the master (0 if there is none).
  uint16_t &PendingRequest =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(
          masterLogicalId);

  uint16_t NumThreads =
      determineNumberOfThreads(PendingRequest, nThreads, threadLimit);

  // The request applies to one parallel region only; clear it so that it
  // does not leak into the following ones.
  if (PendingRequest != 0)
    PendingRequest = 0;

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Hand the master's task state to the work descriptor and record the size
  // of the team.
  getMyWorkDescriptor().WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}
115 
// Worker-side entry into a parallel region. Workers that are not needed are
// deactivated.
//
// WorkFn - out parameter; receives the outlined work function stored in
//          omptarget_nvptx_workFn, or a null pointer when the master signals
//          termination.
//
// Returns true if the calling thread is active in the parallel region and
// false otherwise (including the termination case).
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Fetch the work function of the L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // A null work function is the master's termination signal.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // The master warp never arrives here, so the nvptx thread id can be used
  // directly.
  const int workerId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &sharedWork = getMyWorkDescriptor();

  // Threads beyond the team size do not participate.
  if (workerId >= threadsInTeam)
    return false;

  // Initialize this thread's level-1 task descriptor from the work
  // descriptor and install it as the new top descriptor.
  omptarget_nvptx_TaskDescr *newTaskDescr =
      omptarget_nvptx_threadPrivateContext->Level1TaskDescr(workerId);
  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
  newTaskDescr->CopyFromWorkDescr(sharedWork.WorkTaskDescr());
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(workerId,
                                                             newTaskDescr);
  PRINT(LD_PAR,
        "thread will execute parallel region with id %d in a team of "
        "%d threads\n",
        (int)newTaskDescr->ThreadId(), (int)nThreads);

  // Parallel-level bookkeeping. With Cuda9+ in non-SPMD mode either a single
  // worker or the whole warp is active. A single thread needs no
  // reconvergence; a whole warp must be reconverged before the level is
  // changed, otherwise thread divergence can corrupt the parallel level.
  const bool IsActiveParallelRegion = threadsInTeam != 1;
  IncParallelLevel(IsActiveParallelRegion,
                   IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);

  return true;
}
170 
// Leaves a parallel region on the worker side: restores the previous task
// descriptor as the top descriptor and decrements the parallel level.
//
// Only the worker threads call this routine.
EXTERN void __kmpc_kernel_end_parallel() {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // The master warp never arrives here, so the nvptx thread id can be used
  // directly.
  const int workerId = GetThreadIdInBlock();

  // Pop the task descriptor stack: the predecessor of the current top
  // descriptor becomes the top again.
  omptarget_nvptx_TaskDescr *finishedTask = getMyTopTaskDescriptor(workerId);
  omptarget_nvptx_TaskDescr *parentTask = finishedTask->GetPrevTaskDescr();
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(workerId,
                                                             parentTask);

  // Parallel-level bookkeeping. With Cuda9+ in non-SPMD mode either a single
  // worker or the whole warp is active. A single thread needs no
  // reconvergence; a whole warp must be reconverged before the level is
  // changed, otherwise thread divergence can corrupt the parallel level.
  const bool IsActiveParallelRegion = threadsInTeam != 1;
  DecParallelLevel(IsActiveParallelRegion,
                   IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
}
194 
195 ////////////////////////////////////////////////////////////////////////////////
196 // support for parallel that goes sequential
197 ////////////////////////////////////////////////////////////////////////////////
198 
// Enters a serialized parallel region: increments the parallel level and
// pushes a freshly allocated task descriptor in which the calling thread has
// id 0.
//
// loc        - source location descriptor; passed to the runtime-state and
//              SPMD-mode checks.
// global_tid - not used by this implementation.
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  // The level is bumped first, i.e. also when the runtime is uninitialized.
  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  // Without an initialized runtime there are no task descriptors to manage.
  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // Assumption: this is only reached for nested parallel regions.
  const int logicalId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // In contrast to a real parallel region, the threads of a team do not
  // share a workTaskDescr here and the number of threads is fixed to 1.

  // Save the loop data of the current task before stacking a new descriptor
  // on top of it.
  omptarget_nvptx_TaskDescr *parentTask = getMyTopTaskDescriptor(logicalId);
  parentTask->SaveLoopData();

  // Allocate the new descriptor, copy the parent's values into it and link it
  // to the parent.
  omptarget_nvptx_TaskDescr *serialTask =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  serialTask->CopyParent(parentTask);

  // Serialized parallel: each thread is thread 0 of its own one-thread team.
  serialTask->ThreadId() = 0;

  // Make the new descriptor the top of the stack.
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(logicalId,
                                                             serialTask);
}
236 
// Leaves a serialized parallel region: decrements the parallel level, pops
// and frees the task descriptor pushed by __kmpc_serialized_parallel and
// restores the loop data of the descriptor below it.
//
// loc        - source location descriptor; passed to the runtime-state and
//              SPMD-mode checks.
// global_tid - not used by this implementation.
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  // The level is lowered first, i.e. also when the runtime is uninitialized.
  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  // Without an initialized runtime there is no descriptor to pop.
  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  const int logicalId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // Pop: the predecessor of the current top descriptor becomes the new top.
  omptarget_nvptx_TaskDescr *serialTask = getMyTopTaskDescriptor(logicalId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      logicalId, serialTask->GetPrevTaskDescr());

  // Release the popped descriptor only after its predecessor was read.
  SafeFree(serialTask, "new seq parallel task");

  // Re-read the top descriptor and restore the loop data saved on entry.
  omptarget_nvptx_TaskDescr *restoredTask = getMyTopTaskDescriptor(logicalId);
  restoredTask->RestoreLoopData();
}
260 
// Returns the parallel level recorded for the calling thread's warp.
//
// loc, global_tid - not used by this implementation.
//
// The stored value is masked with (OMP_ACTIVE_PARALLEL_LEVEL - 1).
// NOTE(review): presumably OMP_ACTIVE_PARALLEL_LEVEL is a single flag bit, so
// the mask keeps only the level counter below it - confirm.
EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  const uint16_t warpLevel = parallelLevel[GetWarpId()];
  return warpLevel & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
266 
// This kmpc call returns the thread id across all teams. Its value is cached
// by the compiler and passed back when calling the runtime. On nvptx the
// value is cheap to recalculate, so the runtime never uses the result of
// this call.
//
// loc - source location descriptor; used for the SPMD-mode check.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  const bool isSPMD = checkSPMDMode(loc);
  const int logicalId = GetLogicalThreadIdInBlock(isSPMD);
  return GetOmpThreadId(logicalId, isSPMD);
}
275 
276 ////////////////////////////////////////////////////////////////////////////////
277 // push params
278 ////////////////////////////////////////////////////////////////////////////////
279 
// Records the thread count requested for the calling thread's next parallel
// region.
//
// loc         - source location descriptor; used for the runtime-state and
//               SPMD-mode checks.
// tid         - ignored; the logical thread id is recomputed here.
// num_threads - requested number of threads.
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");

  // Do not rely on the id supplied by the caller; derive it again.
  const int logicalId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(logicalId) =
      num_threads;
}
288 
// No state is recorded here. The host guarantees that the requested number
// of teams was started, so the device only needs to inspect gridDim.
//
// The call itself is unexpected on the device: apart from logging, it only
// raises a fussy-level assertion.
//
// loc, tid, thread_limit - not used.
// num_teams              - only printed in the debug message.
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}
298 
// Accepts a proc_bind request. Apart from the debug message below, the
// request has no effect in this implementation.
//
// loc, tid  - not used.
// proc_bind - only printed in the debug message.
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}
303