/*
 * kmp_wait_release.h -- Wait/Release implementation
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_WAIT_RELEASE_H
#define KMP_WAIT_RELEASE_H

#include "kmp.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/*!
@defgroup WAIT_RELEASE Wait/Release operations

The definitions and functions here implement the lowest level thread
synchronizations of suspending a thread and awaking it. They are used to build
higher level operations such as barriers and fork/join.
*/
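
/* An illustrative sketch (not part of the runtime interface) of how the flag
   classes below are typically paired: one thread waits on a flag location
   while another thread releases it. The location, checker value, thread
   pointer and ITT object are hypothetical placeholders.

   @code
   volatile kmp_uint64 go = 0;
   kmp_flag_64<> flag(&go, (kmp_uint64)KMP_BARRIER_STATE_BUMP);
   // Waiting thread: spins, may execute tasks, and may suspend until the flag
   // reaches the checker value; another thread MUST eventually release it.
   flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
   // Releasing thread: bumps the flag and resumes any sleeping waiters.
   flag.release();
   @endcode
*/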

/*!
@ingroup WAIT_RELEASE
@{
*/

/*!
 * The flag_type describes the storage used for the flag.
 */
enum flag_type {
  flag32, /**< 32 bit flags */
  flag64, /**< 64 bit flags */
  flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
};

struct flag_properties {
  unsigned int type : 16;
  unsigned int reserved : 16;
};

/*!
 * Base class for wait/release volatile flag
 */
template <typename P> class kmp_flag_native {
  volatile P *loc;
  flag_properties t;

public:
  typedef P flag_t;
  kmp_flag_native(volatile P *p, flag_type ft)
      : loc(p), t({(unsigned int)ft, 0U}) {}
  volatile P *get() { return loc; }
  void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
  void set(volatile P *new_loc) { loc = new_loc; }
  flag_type get_type() { return (flag_type)(t.type); }
  P load() { return *loc; }
  void store(P val) { *loc = val; }
};

/*!
 * Base class for wait/release atomic flag
 */
template <typename P> class kmp_flag {
  std::atomic<P>
      *loc; /**< Pointer to the flag storage that is modified by another thread
             */
  flag_properties t; /**< "Type" of the flag in loc */
public:
  typedef P flag_t;
  kmp_flag(std::atomic<P> *p, flag_type ft)
      : loc(p), t({(unsigned int)ft, 0U}) {}
  /*!
   * @result the pointer to the actual flag
   */
  std::atomic<P> *get() { return loc; }
  /*!
   * @result void* pointer to the actual flag
   */
  void *get_void_p() { return RCAST(void *, loc); }
  /*!
   * @param new_loc in   set loc to point at new_loc
   */
  void set(std::atomic<P> *new_loc) { loc = new_loc; }
  /*!
   * @result the flag_type
   */
  flag_type get_type() { return (flag_type)(t.type); }
  /*!
   * @result flag value
   */
  P load() { return loc->load(std::memory_order_acquire); }
  /*!
   * @param val the new flag value to be stored
   */
  void store(P val) { loc->store(val, std::memory_order_release); }
  // Derived classes must provide the following:
  /*
  kmp_info_t * get_waiter(kmp_uint32 i);
  kmp_uint32 get_num_waiters();
  bool done_check();
  bool done_check_val(P old_loc);
  bool notdone_check();
  P internal_release();
  void suspend(int th_gtid);
  void mwait(int th_gtid);
  void resume(int th_gtid);
  P set_sleeping();
  P unset_sleeping();
  bool is_sleeping();
  bool is_any_sleeping();
  bool is_sleeping_val(P old_loc);
  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
                    int *thread_finished
                    USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
                    is_constrained);
  */
};

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __ompt_implicit_task_end(kmp_info_t *this_thr,
                                     ompt_state_t ompt_state,
                                     ompt_data_t *tId) {
  int ds_tid = this_thr->th.th_info.ds.ds_tid;
  if (ompt_state == ompt_state_wait_barrier_implicit) {
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid)) {
      if (ompt_enabled.ompt_callback_implicit_task) {
        int flags = this_thr->th.ompt_thread_info.parallel_flags;
        flags = (flags & ompt_parallel_league) ? ompt_task_initial
                                               : ompt_task_implicit;
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, tId, 0, ds_tid, flags);
      }
      // return to idle state
      this_thr->th.ompt_thread_info.state = ompt_state_idle;
    } else {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
  }
}
#endif

/* Spin wait loop that first does pause/yield, then sleep. A thread that calls
   __kmp_wait_* must make certain that another thread calls __kmp_release
   to wake it back up to prevent deadlocks!

   NOTE: We may not belong to a team at this point.  */
template <class C, bool final_spin, bool Cancellable = false,
          bool Sleepable = true>
static inline bool
__kmp_wait_template(kmp_info_t *this_thr,
                    C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  volatile void *spin = flag->get();
#endif
  kmp_uint32 spins;
  int th_gtid;
  int tasks_completed = FALSE;
  int oversubscribed;
#if !KMP_USE_MONITOR
  kmp_uint64 poll_count;
  kmp_uint64 hibernate_goal;
#else
  kmp_uint32 hibernate;
#endif

  KMP_FSYNC_SPIN_INIT(spin, NULL);
  if (flag->done_check()) {
    KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
    return false;
  }
  th_gtid = this_thr->th.th_info.ds.ds_gtid;
  if (Cancellable) {
    kmp_team_t *team = this_thr->th.th_team;
    if (team && team->t.t_cancel_request == cancel_parallel)
      return true;
  }
#if KMP_OS_UNIX
  if (final_spin)
    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
#endif
  KA_TRACE(20,
           ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
#if KMP_STATS_ENABLED
  stats_state_e thread_state = KMP_GET_THREAD_STATE();
#endif

/* OMPT Behavior:
THIS function is called from
  __kmp_barrier (2 times)  (implicit or explicit barrier in parallel regions)
            these have join / fork behavior

       In these cases, we don't change the state or trigger events in THIS
function.
       Events are triggered in the calling code (__kmp_barrier):

                state := ompt_state_overhead
            barrier-begin
            barrier-wait-begin
                state := ompt_state_wait_barrier
          call join-barrier-implementation (finally arrive here)
          {}
          call fork-barrier-implementation (finally arrive here)
          {}
                state := ompt_state_overhead
            barrier-wait-end
            barrier-end
                state := ompt_state_work_parallel


  __kmp_fork_barrier  (after thread creation, before executing implicit task)
          call fork-barrier-implementation (finally arrive here)
          {} // worker arrive here with state = ompt_state_idle


  __kmp_join_barrier  (implicit barrier at end of parallel region)
                state := ompt_state_barrier_implicit
            barrier-begin
            barrier-wait-begin
          call join-barrier-implementation (finally arrive here
final_spin=FALSE)
          {
          }
  __kmp_fork_barrier  (implicit barrier at end of parallel region)
          call fork-barrier-implementation (finally arrive here final_spin=TRUE)

       Worker after task-team is finished:
            barrier-wait-end
            barrier-end
            implicit-task-end
            idle-begin
                state := ompt_state_idle

       Before leaving, if state = ompt_state_idle
            idle-end
                state := ompt_state_overhead
*/
#if OMPT_SUPPORT
  ompt_state_t ompt_entry_state;
  ompt_data_t *tId;
  if (ompt_enabled.enabled) {
    ompt_entry_state = this_thr->th.ompt_thread_info.state;
    if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
        KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
      ompt_lw_taskteam_t *team =
          this_thr->th.th_team->t.ompt_serialized_team_info;
      if (team) {
        tId = &(team->ompt_task_info.task_data);
      } else {
        tId = OMPT_CUR_TASK_DATA(this_thr);
      }
    } else {
      tId = &(this_thr->th.ompt_thread_info.task_data);
    }
    if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
                       this_thr->th.th_task_team == NULL)) {
      // implicit task is done. Either no taskqueue, or task-team finished
      __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
    }
  }
#endif

  KMP_INIT_YIELD(spins); // Setup for waiting

  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
      __kmp_pause_status == kmp_soft_paused) {
#if KMP_USE_MONITOR
// The worker threads cannot rely on the team struct existing at this point.
// Use the bt values cached in the thread struct instead.
#ifdef KMP_ADJUST_BLOCKTIME
    if (__kmp_pause_status == kmp_soft_paused ||
        (__kmp_zero_bt && !this_thr->th.th_team_bt_set))
      // Force immediate suspend if not set by user and more threads than
      // available procs
      hibernate = 0;
    else
      hibernate = this_thr->th.th_team_bt_intervals;
#else
    hibernate = this_thr->th.th_team_bt_intervals;
#endif /* KMP_ADJUST_BLOCKTIME */

    /* If the blocktime is nonzero, we want to make sure that we spin wait for
       the entirety of the specified #intervals, plus up to one interval more.
       This increment makes certain that this thread doesn't go to sleep too
       soon.  */
    if (hibernate != 0)
      hibernate++;

    // Add in the current time value.
    hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
    KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
                  th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
                  hibernate - __kmp_global.g.g_time.dt.t_value));
#else
    if (__kmp_pause_status == kmp_soft_paused) {
      // Force immediate suspend
      hibernate_goal = KMP_NOW();
    } else
      hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
    poll_count = 0;
#endif // KMP_USE_MONITOR
  }

  oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
  KMP_MB();

  // Main wait spin loop
  while (flag->notdone_check()) {
    kmp_task_team_t *task_team = NULL;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      task_team = this_thr->th.th_task_team;
      /* If the thread's task team pointer is NULL, it means one of 3 things:
         1) A newly-created thread is first being released by
         __kmp_fork_barrier(), and its task team has not been set up yet.
         2) All tasks have been executed to completion.
         3) Tasking is off for this region.  This could be because we are in a
         serialized region (perhaps the outer one), or else tasking was manually
         disabled (KMP_TASKING=0).  */
      if (task_team != NULL) {
        if (TCR_SYNC_4(task_team->tt.tt_active)) {
          if (KMP_TASKING_ENABLED(task_team))
            flag->execute_tasks(
                this_thr, th_gtid, final_spin,
                &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
          else
            this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } else {
          KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
#if OMPT_SUPPORT
          // task-team is done now, other cases should be caught above
          if (final_spin && ompt_enabled.enabled)
            __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
#endif
          this_thr->th.th_task_team = NULL;
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        }
      } else {
        this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
      } // if
    } // if

    KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);

#if KMP_STATS_ENABLED
    // Check if thread has been signalled to idle state
    // This indicates that the logical "join-barrier" has finished
    if (this_thr->th.th_stats->isIdle() &&
        KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
      KMP_SET_THREAD_STATE(IDLE);
      KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
    }
#endif
    // Check if the barrier surrounding this wait loop has been cancelled
    if (Cancellable) {
      kmp_team_t *team = this_thr->th.th_team;
      if (team && team->t.t_cancel_request == cancel_parallel)
        break;
    }

    // Don't suspend if KMP_BLOCKTIME is set to "infinite"
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        __kmp_pause_status != kmp_soft_paused)
      continue;

    // Don't suspend if there is a likelihood of new tasks being spawned.
    if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
      continue;

#if KMP_USE_MONITOR
    // If we have waited a bit more, fall asleep
    if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
      continue;
#else
    if (KMP_BLOCKING(hibernate_goal, poll_count++))
      continue;
#endif
    // Don't suspend if wait loop designated non-sleepable
    // in template parameters
    if (!Sleepable)
      continue;

    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        __kmp_pause_status != kmp_soft_paused)
      continue;

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
    if (__kmp_mwait_enabled || __kmp_umwait_enabled) {
      KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid));
      flag->mwait(th_gtid);
    } else {
#endif
      KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
#if KMP_OS_UNIX
      if (final_spin)
        KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
#endif
      flag->suspend(th_gtid);
#if KMP_OS_UNIX
      if (final_spin)
        KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
#endif
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
    }
#endif

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    } else if (__kmp_tasking_mode != tskm_immediate_exec &&
               this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
      this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
    }
    // TODO: If thread is done with work and times out, disband/free
  }

#if OMPT_SUPPORT
  ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) {
#if OMPT_OPTIONAL
    if (final_spin) {
      __ompt_implicit_task_end(this_thr, ompt_exit_state, tId);
      ompt_exit_state = this_thr->th.ompt_thread_info.state;
    }
#endif
    if (ompt_exit_state == ompt_state_idle) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
  }
#endif
#if KMP_STATS_ENABLED
  // If we were put into idle state, pop that off the state stack
  if (KMP_GET_THREAD_STATE() == IDLE) {
    KMP_POP_PARTITIONED_TIMER();
    KMP_SET_THREAD_STATE(thread_state);
    this_thr->th.th_stats->resetIdleFlag();
  }
#endif

#if KMP_OS_UNIX
  if (final_spin)
    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
#endif
  KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
  if (Cancellable) {
    kmp_team_t *team = this_thr->th.th_team;
    if (team && team->t.t_cancel_request == cancel_parallel) {
      if (tasks_completed) {
        // undo the previous decrement of unfinished_threads so that the
        // thread can decrement at the join barrier with no problem
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        std::atomic<kmp_int32> *unfinished_threads =
            &(task_team->tt.tt_unfinished_threads);
        KMP_ATOMIC_INC(unfinished_threads);
      }
      return true;
    }
  }
  return false;
}

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
// Set up a monitor on the flag variable causing the calling thread to wait in
// a less active state until the flag variable is modified.
template <class C>
static inline void __kmp_mwait_template(int th_gtid, C *flag) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait);
  kmp_info_t *th = __kmp_threads[th_gtid];

  KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid,
                flag->get()));

  // User-level mwait is available
  KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled);

  __kmp_suspend_initialize_thread(th);
  __kmp_lock_suspend_mx(th);

  volatile void *spin = flag->get();
  void *cacheline = (void *)(kmp_uint64(spin) & ~(CACHE_LINE - 1));

  if (!flag->done_check()) {
    // Mark thread as no longer active
    th->th.th_active = FALSE;
    if (th->th.th_active_in_pool) {
      th->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
    }
    flag->set_sleeping();
    KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid));
#if KMP_HAVE_UMWAIT
    if (__kmp_umwait_enabled) {
      __kmp_umonitor(cacheline);
    }
#elif KMP_HAVE_MWAIT
    if (__kmp_mwait_enabled) {
      __kmp_mm_monitor(cacheline, 0, 0);
    }
#endif
    // To avoid a race, check flag between 'monitor' and 'mwait'. A write to
    // the address could happen after the last time we checked and before
    // monitoring started, in which case monitor can't detect the change.
    if (flag->done_check())
      flag->unset_sleeping();
    else {
      // if flag changes here, wake-up happens immediately
      TCW_PTR(th->th.th_sleep_loc, (void *)flag);
      __kmp_unlock_suspend_mx(th);
      KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid));
#if KMP_HAVE_UMWAIT
      if (__kmp_umwait_enabled) {
        __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter
      }
#elif KMP_HAVE_MWAIT
      if (__kmp_mwait_enabled) {
        __kmp_mm_mwait(0, __kmp_mwait_hints);
      }
#endif
      KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid));
      __kmp_lock_suspend_mx(th);
      // Clean up sleep info; doesn't matter how/why this thread stopped waiting
      if (flag->is_sleeping())
        flag->unset_sleeping();
      TCW_PTR(th->th.th_sleep_loc, NULL);
    }
    // Mark thread as active again
    th->th.th_active = TRUE;
    if (TCR_4(th->th.th_in_pool)) {
      KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
      th->th.th_active_in_pool = TRUE;
    }
  } // Drop out to main wait loop to check flag, handle tasks, etc.
  __kmp_unlock_suspend_mx(th);
  KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid));
}
#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT

/* Release any threads specified as waiting on the flag by releasing the flag
   and resuming the waiting thread if indicated by the sleep bit(s). A thread
   that calls __kmp_wait_template must call this function to wake up the
   potentially sleeping thread and prevent deadlocks!  */
template <class C> static inline void __kmp_release_template(C *flag) {
#ifdef KMP_DEBUG
  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
#endif
  KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%p)\n", gtid, flag->get()));
  KMP_DEBUG_ASSERT(flag->get());
  KMP_FSYNC_RELEASING(flag->get_void_p());

  flag->internal_release();

  KF_TRACE(100, ("__kmp_release: T#%d set new spin(%p)=%d\n", gtid, flag->get(),
                 flag->load()));

  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
    // Only need to check sleep stuff if infinite block time not set.
    // Are *any* threads waiting on flag sleeping?
    if (flag->is_any_sleeping()) {
      for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) {
        // if sleeping waiter exists at i, sets current_waiter to i inside flag
        kmp_info_t *waiter = flag->get_waiter(i);
        if (waiter) {
          int wait_gtid = waiter->th.th_info.ds.ds_gtid;
          // Wake up thread if needed
          KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep "
                        "flag(%p) set\n",
                        gtid, wait_gtid, flag->get()));
          flag->resume(wait_gtid); // unsets flag's current_waiter when done
        }
      }
    }
  }
}

template <typename FlagType> struct flag_traits {};

template <> struct flag_traits<kmp_uint32> {
  typedef kmp_uint32 flag_t;
  static const flag_type t = flag32;
  static inline flag_t tcr(flag_t f) { return TCR_4(f); }
  static inline flag_t test_then_add4(volatile flag_t *f) {
    return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f));
  }
  static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
    return KMP_TEST_THEN_OR32(f, v);
  }
  static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
    return KMP_TEST_THEN_AND32(f, v);
  }
};

template <> struct flag_traits<kmp_uint64> {
  typedef kmp_uint64 flag_t;
  static const flag_type t = flag64;
  static inline flag_t tcr(flag_t f) { return TCR_8(f); }
  static inline flag_t test_then_add4(volatile flag_t *f) {
    return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f));
  }
  static inline flag_t test_then_or(volatile flag_t *f, flag_t v) {
    return KMP_TEST_THEN_OR64(f, v);
  }
  static inline flag_t test_then_and(volatile flag_t *f, flag_t v) {
    return KMP_TEST_THEN_AND64(f, v);
  }
};
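
/* An illustrative sketch of what the traits specializations above provide: a
   flag templated on a 32- or 64-bit type picks up the matching plain-read and
   atomic read-modify-write primitives without naming the width itself. The
   variable below is a hypothetical placeholder.

   @code
   volatile kmp_uint64 word = 0;
   typedef flag_traits<kmp_uint64> traits64;
   kmp_uint64 snapshot = traits64::tcr(word); // plain load via TCR_8
   (void)traits64::test_then_add4(&word); // atomic bump by 4, as used by internal_release()
   (void)snapshot;
   @endcode
*/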

// Basic flag that does not use C11 Atomics
template <typename FlagType, bool Sleepable>
class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
  typedef flag_traits<FlagType> traits_type;
  FlagType checker; /**< Value the flag is compared against to check whether
                       it has been released. */
  kmp_info_t
      *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
  kmp_uint32
      num_waiting_threads; /**< Number of threads sleeping on this thread. */
public:
  kmp_basic_flag_native(volatile FlagType *p)
      : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
  kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
      : kmp_flag_native<FlagType>(p, traits_type::t), num_waiting_threads(1) {
    waiting_threads[0] = thr;
  }
  kmp_basic_flag_native(volatile FlagType *p, FlagType c)
      : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
        num_waiting_threads(0) {}
  /*!
   * @param i in   index into waiting_threads
   * @result the thread that is waiting at index i
   */
  kmp_info_t *get_waiter(kmp_uint32 i) {
    KMP_DEBUG_ASSERT(i < num_waiting_threads);
    return waiting_threads[i];
  }
  /*!
   * @result num_waiting_threads
   */
  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
  /*!
   * @param thr in   the thread which is now waiting
   *
   * Insert a waiting thread at index 0.
   */
  void set_waiter(kmp_info_t *thr) {
    waiting_threads[0] = thr;
    num_waiting_threads = 1;
  }
  /*!
   * @result true if the flag object has been released.
   */
  bool done_check() {
    if (Sleepable)
      return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) ==
             checker;
    else
      return traits_type::tcr(*(this->get())) == checker;
  }
  /*!
   * @param old_loc in   old value of flag
   * @result true if the flag's old value indicates it was released.
   */
  bool done_check_val(FlagType old_loc) { return old_loc == checker; }
  /*!
   * @result true if the flag object is not yet released.
   * Used in __kmp_wait_template like:
   * @code
   * while (flag.notdone_check()) { pause(); }
   * @endcode
   */
  bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
  /*!
   * @result Actual flag value before release was applied.
   * Trigger all waiting threads to run by modifying flag to release state.
   */
  void internal_release() {
    (void)traits_type::test_then_add4((volatile FlagType *)this->get());
  }
  /*!
   * @result Actual flag value before sleep bit(s) set.
   * Notes that there is at least one thread sleeping on the flag by setting
   * sleep bit(s).
   */
  FlagType set_sleeping() {
    return traits_type::test_then_or((volatile FlagType *)this->get(),
                                     KMP_BARRIER_SLEEP_STATE);
  }
  /*!
   * @result Actual flag value before sleep bit(s) cleared.
   * Notes that there are no longer threads sleeping on the flag by clearing
   * sleep bit(s).
   */
  FlagType unset_sleeping() {
    return traits_type::test_then_and((volatile FlagType *)this->get(),
                                      ~KMP_BARRIER_SLEEP_STATE);
  }
  /*!
   * @param old_loc in   old value of flag
   * Test whether there are threads sleeping on the flag's old value in old_loc.
   */
  bool is_sleeping_val(FlagType old_loc) {
    return old_loc & KMP_BARRIER_SLEEP_STATE;
  }
  /*!
   * Test whether there are threads sleeping on the flag.
   */
  bool is_sleeping() { return is_sleeping_val(*(this->get())); }
  bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
  kmp_uint8 *get_stolen() { return NULL; }
  enum barrier_type get_bt() { return bs_last_barrier; }
};
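
/* A worked illustration of why the Sleepable variant of done_check() masks
   out KMP_BARRIER_SLEEP_STATE: a waiter may have set the sleep bit before the
   releaser bumped the flag, and internal_release() does not clear that bit,
   so the raw value can differ from checker by exactly the sleep bit. (Bit
   positions live in kmp.h; the scenario below is illustrative only.)

   @code
   // value == checker | KMP_BARRIER_SLEEP_STATE  (released, sleep bit still set)
   // Sleepable == true : (value & ~KMP_BARRIER_SLEEP_STATE) == checker -> done
   // Sleepable == false:  value == checker                             -> not done yet
   @endcode
*/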

template <typename FlagType, bool Sleepable>
class kmp_basic_flag : public kmp_flag<FlagType> {
  typedef flag_traits<FlagType> traits_type;
  FlagType checker; /**< Value the flag is compared against to check whether
                       it has been released. */
  kmp_info_t
      *waiting_threads[1]; /**< Array of threads sleeping on this thread. */
  kmp_uint32
      num_waiting_threads; /**< Number of threads sleeping on this thread. */
public:
  kmp_basic_flag(std::atomic<FlagType> *p)
      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
  kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
      : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
    waiting_threads[0] = thr;
  }
  kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
      : kmp_flag<FlagType>(p, traits_type::t), checker(c),
        num_waiting_threads(0) {}
  /*!
   * @param i in   index into waiting_threads
   * @result the thread that is waiting at index i
   */
  kmp_info_t *get_waiter(kmp_uint32 i) {
    KMP_DEBUG_ASSERT(i < num_waiting_threads);
    return waiting_threads[i];
  }
  /*!
   * @result num_waiting_threads
   */
  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
  /*!
   * @param thr in   the thread which is now waiting
   *
   * Insert a waiting thread at index 0.
   */
  void set_waiter(kmp_info_t *thr) {
    waiting_threads[0] = thr;
    num_waiting_threads = 1;
  }
  /*!
   * @result true if the flag object has been released.
   */
  bool done_check() {
    if (Sleepable)
      return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker;
    else
      return this->load() == checker;
  }
  /*!
   * @param old_loc in   old value of flag
   * @result true if the flag's old value indicates it was released.
   */
  bool done_check_val(FlagType old_loc) { return old_loc == checker; }
  /*!
   * @result true if the flag object is not yet released.
   * Used in __kmp_wait_template like:
   * @code
   * while (flag.notdone_check()) { pause(); }
   * @endcode
   */
  bool notdone_check() { return this->load() != checker; }
  /*!
   * @result Actual flag value before release was applied.
   * Trigger all waiting threads to run by modifying flag to release state.
   */
  void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
  /*!
   * @result Actual flag value before sleep bit(s) set.
   * Notes that there is at least one thread sleeping on the flag by setting
   * sleep bit(s).
   */
  FlagType set_sleeping() {
    return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
  }
  /*!
   * @result Actual flag value before sleep bit(s) cleared.
   * Notes that there are no longer threads sleeping on the flag by clearing
   * sleep bit(s).
   */
  FlagType unset_sleeping() {
    return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
  }
  /*!
   * @param old_loc in   old value of flag
   * Test whether there are threads sleeping on the flag's old value in old_loc.
   */
  bool is_sleeping_val(FlagType old_loc) {
    return old_loc & KMP_BARRIER_SLEEP_STATE;
  }
  /*!
   * Test whether there are threads sleeping on the flag.
   */
  bool is_sleeping() { return is_sleeping_val(this->load()); }
  bool is_any_sleeping() { return is_sleeping_val(this->load()); }
  kmp_uint8 *get_stolen() { return NULL; }
  enum barrier_type get_bt() { return bs_last_barrier; }
};

template <bool Cancellable, bool Sleepable>
class kmp_flag_32 : public kmp_basic_flag<kmp_uint32, Sleepable> {
public:
  kmp_flag_32(std::atomic<kmp_uint32> *p)
      : kmp_basic_flag<kmp_uint32, Sleepable>(p) {}
  kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_info_t *thr)
      : kmp_basic_flag<kmp_uint32, Sleepable>(p, thr) {}
  kmp_flag_32(std::atomic<kmp_uint32> *p, kmp_uint32 c)
      : kmp_basic_flag<kmp_uint32, Sleepable>(p, c) {}
  void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); }
#endif
  void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
                    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
                    kmp_int32 is_constrained) {
    return __kmp_execute_tasks_32(
        this_thr, gtid, this, final_spin,
        thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  }
  bool wait(kmp_info_t *this_thr,
            int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
    if (final_spin)
      return __kmp_wait_template<kmp_flag_32, TRUE, Cancellable, Sleepable>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
    else
      return __kmp_wait_template<kmp_flag_32, FALSE, Cancellable, Sleepable>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  void release() { __kmp_release_template(this); }
  flag_type get_ptr_type() { return flag32; }
};

template <bool Cancellable, bool Sleepable>
class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64, Sleepable> {
public:
  kmp_flag_64(volatile kmp_uint64 *p)
      : kmp_basic_flag_native<kmp_uint64, Sleepable>(p) {}
  kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr)
      : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, thr) {}
  kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c)
      : kmp_basic_flag_native<kmp_uint64, Sleepable>(p, c) {}
  void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); }
#endif
  void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
                    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
                    kmp_int32 is_constrained) {
    return __kmp_execute_tasks_64(
        this_thr, gtid, this, final_spin,
        thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  }
  bool wait(kmp_info_t *this_thr,
            int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
    if (final_spin)
      return __kmp_wait_template<kmp_flag_64, TRUE, Cancellable, Sleepable>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
    else
      return __kmp_wait_template<kmp_flag_64, FALSE, Cancellable, Sleepable>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  void release() { __kmp_release_template(this); }
  flag_type get_ptr_type() { return flag64; }
};

// Hierarchical 64-bit on-core barrier instantiation
class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
  kmp_uint64 checker;
  kmp_info_t *waiting_threads[1];
  kmp_uint32 num_waiting_threads;
  kmp_uint32
      offset; /**< Portion of flag that is of interest for an operation. */
  bool flag_switch; /**< Indicates a switch in flag location. */
  enum barrier_type bt; /**< Barrier type. */
  kmp_info_t *this_thr; /**< Thread that may be redirected to different flag
                           location. */
#if USE_ITT_BUILD
  void *
      itt_sync_obj; /**< ITT object that must be passed to new flag location. */
#endif
  unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) {
    return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset];
  }

public:
  kmp_flag_oncore(volatile kmp_uint64 *p)
      : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
        flag_switch(false) {}
  kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
      : kmp_flag_native<kmp_uint64>(p, flag_oncore), num_waiting_threads(0),
        offset(idx), flag_switch(false) {}
  kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
                  enum barrier_type bar_t,
                  kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
      : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
        num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
        this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {}
  kmp_info_t *get_waiter(kmp_uint32 i) {
    KMP_DEBUG_ASSERT(i < num_waiting_threads);
    return waiting_threads[i];
  }
  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
  void set_waiter(kmp_info_t *thr) {
    waiting_threads[0] = thr;
    num_waiting_threads = 1;
  }
  bool done_check_val(kmp_uint64 old_loc) {
    return byteref(&old_loc, offset) == checker;
  }
  bool done_check() { return done_check_val(*get()); }
  bool notdone_check() {
    // Calculate flag_switch
    if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
      flag_switch = true;
    if (byteref(get(), offset) != 1 && !flag_switch)
      return true;
    else if (flag_switch) {
      this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
      kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go,
                         (kmp_uint64)KMP_BARRIER_STATE_BUMP);
      __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    return false;
  }
  void internal_release() {
    // Other threads can write their own bytes simultaneously.
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
      byteref(get(), offset) = 1;
    } else {
      kmp_uint64 mask = 0;
      byteref(&mask, offset) = 1;
      KMP_TEST_THEN_OR64(get(), mask);
    }
  }
  kmp_uint64 set_sleeping() {
    return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
  }
  kmp_uint64 unset_sleeping() {
    return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
  }
  bool is_sleeping_val(kmp_uint64 old_loc) {
    return old_loc & KMP_BARRIER_SLEEP_STATE;
  }
  bool is_sleeping() { return is_sleeping_val(*get()); }
  bool is_any_sleeping() { return is_sleeping_val(*get()); }
  void wait(kmp_info_t *this_thr, int final_spin) {
    if (final_spin)
      __kmp_wait_template<kmp_flag_oncore, TRUE>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
    else
      __kmp_wait_template<kmp_flag_oncore, FALSE>(
          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  void release() { __kmp_release_template(this); }
  void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); }
#endif
  void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
                    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
                    kmp_int32 is_constrained) {
    return __kmp_execute_tasks_oncore(
        this_thr, gtid, this, final_spin,
        thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  }
  kmp_uint8 *get_stolen() { return NULL; }
  enum barrier_type get_bt() { return bt; }
  flag_type get_ptr_type() { return flag_oncore; }
};
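
/* An illustrative sketch of the on-core flag's byte addressing: threads on
   one core share a single 64-bit word, and offset selects the byte that a
   given kmp_flag_oncore instance watches or releases. The offset value below
   is a hypothetical placeholder; byteref() is the private helper defined in
   the class above.

   @code
   volatile kmp_uint64 core_flag = 0;
   // Infinite-blocktime release path: write the owned byte directly, e.g.
   //   byteref(&core_flag, 3) = 1;
   // Finite-blocktime release path: OR in a mask so concurrent updates to the
   // other bytes (and the sleep bit) are not lost, e.g.
   //   kmp_uint64 mask = 0; byteref(&mask, 3) = 1; KMP_TEST_THEN_OR64(&core_flag, mask);
   // A waiter constructed with offset == 3 is done once its byte equals checker.
   @endcode
*/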

// Used to wake up threads, volatile void* flag is usually the th_sleep_loc
// associated with int gtid.
static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
  if (!flag)
    return;

  switch (RCAST(kmp_flag_64<> *, CCAST(void *, flag))->get_type()) {
  case flag32:
    __kmp_resume_32(gtid, (kmp_flag_32<> *)NULL);
    break;
  case flag64:
    __kmp_resume_64(gtid, (kmp_flag_64<> *)NULL);
    break;
  case flag_oncore:
    __kmp_resume_oncore(gtid, (kmp_flag_oncore *)NULL);
    break;
  }
}

/*!
@}
*/

#endif // KMP_WAIT_RELEASE_H