//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "common/omptarget.h"
#include "target_impl.h"
#include "common/target_atomic.h"

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Template class that encapsulates all the helper functions.
//
// T is the loop iteration type (32- or 64-bit, signed or unsigned).
// ST is the signed version of T.
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate the initial bounds and stride for a static loop
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to the last-iteration flag
   *  @param[in,out] plower pointer to the loop lower bound; on return it
   *  contains the lower bound of the first chunk
   *  @param[in,out] pupper pointer to the loop upper bound; on return it
   *  contains the upper bound of the first chunk
   *  @param[in,out] pstride pointer to the loop stride; on return it contains
   *  the stride between two successive chunks executed by the same thread
   *  @param[in] incr loop increment bump
   *  @param[in] chunk chunk size
   */

  // helper function for static chunk
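  // A worked example (illustrative): lb = 0, ub = 99, chunk = 10, 4 threads.
  // Then stride = 40 and thread 1 executes [10,19], [50,59], [90,99]; its
  // lower bound reaches 90 = ub - ub % chunk, the start of the last chunk,
  // so it is the one flagged as last.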
  INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
                                    ST chunk, T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size except
    // possibly the last one.

    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
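  // A worked example (illustrative): lb = 0, ub = 9, 4 threads. loopSize = 10,
  // so chunk = 2 with leftOver = 2: threads 0-3 get [0,2], [3,5], [6,7] and
  // [8,9], and thread 3, whose chunk contains the input ub, is flagged last.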
  INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
                                      ST &chunk, T entityId,
                                      T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; all chunks are of nearly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
                                     int32_t *plastiter, T *plower, T *pupper,
                                     ST *pstride, ST chunk,
                                     bool IsSPMDExecutionMode) {
    // When the runtime is uninitialized, we assume that the caller is
    // in an L0 parallel region and that all worker threads participate.

    // Assume we are in a teams region or that we use a single block
    // per target region.
    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);

    // Warps in excess of the maximum requested do not execute the loop.
    PRINT(LD_LOOP,
          "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
          "%d, num tids %d\n",
          (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
          (int)numberOfActiveOMPThreads);
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;
    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <= 0, fall through and use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
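        // Note: this bitwise round-up to a multiple of chunk is exact only
        // when chunk is a power of two (e.g. span = 13, chunk = 4 gives 16);
        // the caller is presumably expected to guarantee that here.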
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <= 0, fall through and use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
        break;
      } // note: if chunk <= 0, fall through and use nochunk
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
                     GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      PRINT(LD_LOOP, "unknown schedtype %d, revert to static chunk\n",
            (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
    PRINT(LD_LOOP,
          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
          "%d\n",
          (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
          (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
          (int)lastiter);
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                   ST chunk) {
    if (checkRuntimeUninitialized(loc)) {
      // In SPMD mode there is no need to check the parallelism level -
      // dynamic scheduling may appear only in L2 parallel regions with
      // lightweight runtime.
      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
      return;
    }
    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
    T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently we just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it does, we will want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal.)
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
            (long)tnum, (long long)tripCount, (int)schedule);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
      chunk = currTaskDescr->RuntimeChunkSize();
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else {
      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
      ASSERT(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "unknown schedule %d & chunk %lld\n", (int)schedule,
             (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
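      // (As above, this round-up assumes chunk is a power of two.)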
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static balanced chunk) : num threads = %d, "
            "ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      if (chunk < 1)
        chunk = 1;
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
        __kmpc_impl_threadfence_block();
      }
      __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
            ", chunk %" PRIu64 "\n",
            (int)tnum,
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            omptarget_nvptx_threadPrivateContext->Chunk(tid));
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

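  // Broadcast a 64-bit value from the leader lane to all lanes in 'active'.
  // The shuffle primitive moves 32-bit registers, so the value is split into
  // two halves, each shuffled from the leader, and then repacked.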
  INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
                                 int leader) {
    uint32_t lo, hi;
    __kmpc_impl_unpack(val, lo, hi);
    hi = __kmpc_impl_shfl_sync(active, hi, leader);
    lo = __kmpc_impl_shfl_sync(active, lo, leader);
    return __kmpc_impl_pack(lo, hi);
  }

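  // Claim the next loop iteration indices with a warp-aggregated atomic:
  // one lane performs a single atomic add of popc(active) on the shared
  // counter, the old value is broadcast from the leader, and each lane adds
  // its rank among the active lanes. Example (illustrative): with active
  // lanes {0, 2, 3}, Cnt is bumped by 3 once; if its old value was N, the
  // three lanes obtain N, N + 1 and N + 2.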
  INLINE static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
    uint32_t leader = __kmpc_impl_ffs(active) - 1;
    uint32_t change = __kmpc_impl_popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
    unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
      warp_res = __kmpc_atomic_add(
          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
          (unsigned long long)change);
    }
    warp_res = Shuffle(active, warp_res, leader);
    return warp_res + rank;
  }

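  // Example (illustrative): for a loop over [0, 99] with chunkSize 10,
  // N = 3 yields chunk [30, 39] (NOT_FINISHED); N = 9 yields [90, 99],
  // the LAST_CHUNK; N = 10 yields lb = 100 > 99, i.e. FINISHED.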
  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
                                     T loopLowerBound, T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb <= loopUpperBound and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb <= loopUpperBound and ub >= loopUpperBound --> LAST_CHUNK
    //  c. lb and ub > loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. empty chunk: mark the bounds past the end so callers see an empty
    // range.
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
          (long long)ub, (long long)loopUpperBound);
    return FINISHED;
  }

  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
                                  T *plower, T *pupper, ST *pstride) {
    if (checkRuntimeUninitialized(loc)) {
      // In SPMD mode there is no need to check the parallelism level -
      // dynamic scheduling may appear only in L2 parallel regions with
      // lightweight runtime.
      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
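      // With the lightweight runtime each thread executes its entire range
      // exactly once: *plast doubles as a completion flag, so the first call
      // reports work remaining and the second reports done.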
      if (*plast)
        return DISPATCH_FINISHED;
      *plast = 1;
      return DISPATCH_NOTFINISHED;
    }
    // Logical ID of the thread in the block; automatically selects the
    // thread or warp ID based on the selected implementation.
    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule =
        omptarget_nvptx_threadPrivateContext->ScheduleType(tid);

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
      T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
      // finished?
      if (myLb > ub) {
        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
              (long long)myLb, (long long)ub);
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
            (long long)*plower, (long long)*pupper);
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(
        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // there is work left (a regular chunk or the last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    PRINT(LD_LOOP,
          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
          "last %d\n",
          (int)GetNumberOfOmpThreads(isSPMDMode()),
          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
          (long long)*pupper, (long long)*pstride, (int)*plast);
    return DISPATCH_NOTFINISHED;
  }

  INLINE static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of the template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
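
// A sketch of how a compiler typically drives this interface, e.g. for
// '#pragma omp for schedule(dynamic, C)' (illustrative only; real codegen
// differs in details, and DISPATCH_NOTFINISHED is assumed to be nonzero):
//
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, lb, ub, /*st=*/1, C);
//   int32_t last, lo, hi, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lo, &hi, &st))
//     for (int32_t i = lo; i <= hi; ++i) // Clang uses i <= ub
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);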

// init
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int32_t lb, int32_t ub,
                                   int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint32_t lb, uint32_t ub,
                                    int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int64_t lb, int64_t ub,
                                   int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint64_t lb, uint64_t ub,
                                    int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

// next
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
                                   int32_t *p_last, uint32_t *p_lb,
                                   uint32_t *p_ub, int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
                                   int32_t *p_last, uint64_t *p_lb,
                                   uint64_t *p_ub, int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
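
// A sketch of typical compiler-generated use, e.g. for
// '#pragma omp for schedule(static, C)' over i = 0..N-1 (illustrative only;
// real codegen differs in details):
//
//   int32_t last, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
//                            &ub, &stride, /*incr=*/1, /*chunk=*/C);
//   for (; lb <= N - 1; lb += stride, ub += stride)
//     for (int32_t i = lb; i <= (ub < N - 1 ? ub : N - 1); ++i)
//       body(i);
//   __kmpc_for_static_fini(loc, tid);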

EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int32_t *plower, int32_t *pupper,
                                          int32_t *pstride, int32_t incr,
                                          int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint32_t *plower,
                                           uint32_t *pupper, int32_t *pstride,
                                           int32_t incr, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int64_t *plower, int64_t *pupper,
                                          int64_t *pstride, int64_t incr,
                                          int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint64_t *plower,
                                           uint64_t *pupper, int64_t *pstride,
                                           int64_t incr, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_4u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}