//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "common/omptarget.h"
#include "target_impl.h"
#include "common/target_atomic.h"

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// template class that encapsulates all the helper functions
//
// T is the loop iteration type (32- or 64-bit, signed or unsigned)
// ST is the signed version of T
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   * @param[in] loc location in code of the call (not used here)
   * @param[in] global_tid global thread id
   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   * @param[in] plastiter pointer to last iteration
   * @param[in,out] plower pointer to loop lower bound. It will contain the
   * value of the lower bound of the first chunk
   * @param[in,out] pupper pointer to loop upper bound. It will contain the
   * value of the upper bound of the first chunk
   * @param[in,out] pstride pointer to loop stride. It will contain the value
   * of the stride between two successive chunks executed by the same thread
   * @param[in] incr loop increment bump
   * @param[in] chunk chunk size
   */

  // helper function for static chunk
  INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
                                    ST chunk, T entityId, T numberOfEntities) {
    // each thread executes multiple chunks all of the same size, except
    // the last one

    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
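
  // Worked example (illustrative only): lb = 0, ub = 99, chunk = 10, and
  // numberOfEntities = 4 threads. Thread 1 gets lb = 10, ub = 19, and
  // stride = 40, so it executes chunks [10,19], [50,59], [90,99]. The last
  // chunk begins at ub' = 99 - (99 % 10) = 90, and (90 - 10) % 40 == 0, so
  // thread 1 is flagged as executing the last iteration.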

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
                                      ST &chunk, T entityId,
                                      T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are almost all of equal size
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
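
  // Worked example (illustrative only): lb = 0, ub = 9 (loopSize = 10) split
  // over 4 threads gives chunk = 2 with leftOver = 2, so the first two
  // threads get one extra iteration: thread 0 -> [0,2], thread 1 -> [3,5],
  // thread 2 -> [6,7], thread 3 -> [8,9].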

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
                                     int32_t *plastiter, T *plower, T *pupper,
                                     ST *pstride, ST chunk,
                                     bool IsSPMDExecutionMode) {
    // When IsRuntimeUninitialized is true, we assume that the caller is
    // in an L0 parallel region and that all worker threads participate.

    // Assume we are in a teams region or that we use a single block
    // per target region
    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);

    // All warps in excess of the maximum requested do not execute the loop
    PRINT(LD_LOOP,
          "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
          "%d, num tids %d\n",
          (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
          (int)numberOfActiveOMPThreads);
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;
    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <= 0, fall through and use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment: round span up to a multiple of chunk
        // (the bitmask trick assumes chunk is a power of two)
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <= 0, fall through and use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
        break;
      } // note: if chunk <= 0, fall through and use nochunk
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
                     GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
            (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
    PRINT(LD_LOOP,
          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
          "%d\n",
          (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
          (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
          (int)lastiter);
  }
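
  // For orientation, a rough sketch (not the exact Clang codegen) of how a
  // compiler drives for_static_init through the __kmpc entry points for
  //   #pragma omp for schedule(static, C)
  // over the iteration space [0, N-1]; the names below are illustrative:
  //
  //   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
  //   __kmpc_for_static_init_4(loc, gtid, kmp_sched_static_chunk, &last,
  //                            &lb, &ub, &st, /*incr=*/1, /*chunk=*/C);
  //   // lb/ub now bound this thread's first chunk; st is the distance to
  //   // its next chunk.
  //   for (; lb <= N - 1; lb += st, ub += st)
  //     for (int32_t i = lb; i <= min(ub, N - 1); ++i)
  //       body(i);
  //   __kmpc_for_static_fini(loc, gtid);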

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                   ST chunk) {
    if (checkRuntimeUninitialized(loc)) {
      // In SPMD mode no need to check parallelism level - dynamic scheduling
      // may appear only in L2 parallel regions with lightweight runtime.
      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
      return;
    }
    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
    T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
            (long)tnum, (long long)tripCount, (int)schedule);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
      chunk = currTaskDescr->RuntimeChunkSize();
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else {
      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
      ASSERT(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "unknown schedule %d & chunk %lld\n", (int)schedule,
             (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment: round span up to a multiple of chunk
      // (the bitmask trick assumes chunk is a power of two)
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static balanced chunk) : num threads = %d, ub = "
            "%" PRId64 ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      if (chunk < 1)
        chunk = 1;
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      // Reset the shared chunk counter exactly once and make the reset
      // visible to the whole team before any thread starts grabbing chunks.
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
        __kmpc_impl_threadfence_block();
      }
      __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
            ", chunk %" PRIu64 "\n",
            (int)tnum,
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            omptarget_nvptx_threadPrivateContext->Chunk(tid));
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  // Broadcast a 64-bit value from the leader lane to every lane in `active`.
  // Hardware shuffles move 32-bit registers, so the value is split, shuffled
  // in two halves, and repacked.
  INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
                                 int leader) {
    uint32_t lo, hi;
    __kmpc_impl_unpack(val, lo, hi);
    hi = __kmpc_impl_shfl_sync(active, hi, leader);
    lo = __kmpc_impl_shfl_sync(active, lo, leader);
    return __kmpc_impl_pack(lo, hi);
  }

  // Warp-aggregated chunk counter increment: instead of one atomic per lane,
  // the lowest active lane performs a single atomic add of the number of
  // active lanes, and every lane derives its own iteration index from the
  // broadcast result plus its rank among the active lanes.
  INLINE static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
    uint32_t leader = __kmpc_impl_ffs(active) - 1;
    uint32_t change = __kmpc_impl_popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
    unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
      // Only the leader updates the shared counter.
      warp_res = __kmpc_atomic_add(
          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
          (unsigned long long)change);
    }
    // All lanes receive the leader's old counter value via the broadcast.
    warp_res = Shuffle(active, warp_res, leader);
    return warp_res + rank;
  }
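
  // Worked example (illustrative only): if 8 lanes are active and Cnt() is
  // currently 40, the leader atomically adds 8 and gets back the old value
  // 40; lanes of rank 0..7 then claim chunk indices 40..47 without any
  // further atomics.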

  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
                                     T loopLowerBound, T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    // a. lb and ub < loopUpperBound --> NOT_FINISHED
    // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //    LAST_CHUNK
    // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, the chunk lies entirely past the loop upper bound
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
          (long long)ub, (long long)loopUpperBound);
    return FINISHED;
  }
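
  // Worked example (illustrative only): with loopLowerBound = 0,
  // loopUpperBound = 9 and chunkSize = 4, successive NextIter() values give
  // N = 0 -> [0,3] NOT_FINISHED, N = 1 -> [4,7] NOT_FINISHED,
  // N = 2 -> [8,11] clipped to [8,9] LAST_CHUNK, N = 3 -> FINISHED.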

  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
                                  T *plower, T *pupper, ST *pstride) {
    if (checkRuntimeUninitialized(loc)) {
      // In SPMD mode no need to check parallelism level - dynamic scheduling
      // may appear only in L2 parallel regions with lightweight runtime.
      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
      if (*plast)
        return DISPATCH_FINISHED;
      *plast = 1;
      return DISPATCH_NOTFINISHED;
    }
    // ID of a thread in its own warp; automatically selects thread or warp
    // ID based on the selected implementation
    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule =
        omptarget_nvptx_threadPrivateContext->ScheduleType(tid);

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
      T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
      // finished?
      if (myLb > ub) {
        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
              (long long)myLb, (long long)ub);
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
            (long long)*plower, (long long)*pupper);
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(
        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    PRINT(LD_LOOP,
          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
          "last %d\n",
          (int)GetNumberOfOmpThreads(isSPMDMode()),
          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
          (long long)*pupper, (long long)*pstride, (int)*plast);
    return DISPATCH_NOTFINISHED;
  }

  INLINE static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

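// For orientation, a rough sketch (not the exact Clang codegen) of the
// driver loop a compiler emits around these entry points for
//   #pragma omp for schedule(dynamic, C)
// over [0, N-1]; the schedule constant and names here are illustrative:
//
//   int32_t last = 0, lb = 0, ub = 0, st = 0;
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, C);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);
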
// init
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int32_t lb, int32_t ub,
                                   int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint32_t lb, uint32_t ub,
                                    int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int64_t lb, int64_t ub,
                                   int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint64_t lb, uint64_t ub,
                                    int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

// next
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
                                   int32_t *p_last, uint32_t *p_lb,
                                   uint32_t *p_ub, int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
                                   int32_t *p_last, uint64_t *p_lb,
                                   uint64_t *p_ub, int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      checkSPMDMode(loc));
}

EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int32_t *plower, int32_t *pupper,
                                          int32_t *pstride, int32_t incr,
                                          int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint32_t *plower,
                                           uint32_t *pupper, int32_t *pstride,
                                           int32_t incr, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int64_t *plower, int64_t *pupper,
                                          int64_t *pstride, int64_t incr,
                                          int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint64_t *plower,
                                           uint64_t *pupper, int64_t *pstride,
                                           int64_t incr, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_4u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}