1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
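/* Note: outside of KMP_MIC builds the ngo_* macros above degrade to plain
   copies (copy_icvs / KMP_MEMCPY) and no-op fences. On KMP_MIC with NGO
   stores, ngo_load caches a 64-byte line of ICVs in a 512-bit register, the
   ngo_store_* macros write it back with non-globally-ordered stores, and
   ngo_sync supplies the ordering those stores do not provide on their own. */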
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50   kmp_team_t *team = this_thr->th.th_team;
51   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52   kmp_info_t **other_threads = team->t.t_threads;
53 
54   KA_TRACE(
55       20,
56       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57        gtid, team->t.t_id, tid, bt));
58   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61   // Barrier imbalance - save arrive time to the thread
62   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64         __itt_get_timestamp();
65   }
66 #endif
67   // We now perform a linear reduction to signal that all of the threads have
68   // arrived.
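  /* Sketch of the scheme implemented below: each worker simply bumps its own
     b_arrived flag and releases the master, while the master (tid 0) waits on
     every worker's b_arrived in turn, folding reduce_data together when a
     reduction callback was supplied, and finally publishes the new arrived
     value on the team barrier structure. */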
69   if (!KMP_MASTER_TID(tid)) {
70     KA_TRACE(20,
71              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72               " arrived(%p): %llu => %llu\n",
73               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76     // Mark arrival to master thread
77     /* After performing this write, a worker thread may not assume that the team
78        is valid any more - it could be deallocated by the master thread at any
79        time. */
80     ANNOTATE_BARRIER_BEGIN(this_thr);
81     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82     flag.release();
83   } else {
84     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85     int nproc = this_thr->th.th_team_nproc;
86     int i;
87     // Don't have to worry about sleep bit here or atomic since team setting
88     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
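    // Note: arrived counters advance by KMP_BARRIER_STATE_BUMP rather than by
    // one per barrier, leaving the low-order bits free for flag state (e.g.
    // the sleep bit); new_state is the value each worker's flag must reach
    // for this round.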
89 
90     // Collect all the worker team member threads.
91     for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93       // Prefetch next thread's arrived count
94       if (i + 1 < nproc)
95         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                     "arrived(%p) == %llu\n",
99                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                     team->t.t_id, i,
101                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103       // Wait for worker thread to arrive
104       if (cancellable) {
105         kmp_flag_64<true, false> flag(
106             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108           return true;
109       } else {
110         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111                            new_state);
112         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113       }
114       ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116       // Barrier imbalance - write min of the thread time and the other thread
117       // time to the thread.
118       if (__kmp_forkjoin_frames_mode == 2) {
119         this_thr->th.th_bar_min_time = KMP_MIN(
120             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121       }
122 #endif
123       if (reduce) {
124         KA_TRACE(100,
125                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                   team->t.t_id, i));
128         ANNOTATE_REDUCE_AFTER(reduce);
129         OMPT_REDUCTION_DECL(this_thr, gtid);
130         OMPT_REDUCTION_BEGIN;
131         (*reduce)(this_thr->th.th_local.reduce_data,
132                   other_threads[i]->th.th_local.reduce_data);
133         OMPT_REDUCTION_END;
134         ANNOTATE_REDUCE_BEFORE(reduce);
135         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136       }
137     }
138     // Don't have to worry about sleep bit here or atomic since team setting
139     team_bar->b_arrived = new_state;
140     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                   "arrived(%p) = %llu\n",
142                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                   new_state));
144   }
145   KA_TRACE(
146       20,
147       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148        gtid, team->t.t_id, tid, bt));
149   return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158   kmp_team_t *team;
159 
160   if (KMP_MASTER_TID(tid)) {
161     unsigned int i;
162     kmp_uint32 nproc = this_thr->th.th_team_nproc;
163     kmp_info_t **other_threads;
164 
165     team = __kmp_threads[gtid]->th.th_team;
166     KMP_DEBUG_ASSERT(team != NULL);
167     other_threads = team->t.t_threads;
168 
169     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                   "barrier type %d\n",
171                   gtid, team->t.t_id, tid, bt));
172 
173     if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175       {
176         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177         if (propagate_icvs) {
178           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179           for (i = 1; i < nproc; ++i) {
180             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                      team, i, FALSE);
182             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                            &team->t.t_implicit_task_taskdata[0].td_icvs);
184           }
185           ngo_sync();
186         }
187       }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190       // Now, release all of the worker threads
191       for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193         // Prefetch next thread's go flag
194         if (i + 1 < nproc)
195           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197         KA_TRACE(
198             20,
199             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200              "go(%p): %u => %u\n",
201              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203              other_threads[i]->th.th_bar[bt].bb.b_go,
204              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205         ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                          other_threads[i]);
208         flag.release();
209       }
210     }
211   } else { // Wait for the MASTER thread to release us
212     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214     if (cancellable) {
215       kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216       if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
217         return true;
218     } else {
219       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221     }
222     ANNOTATE_BARRIER_END(this_thr);
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
225       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
226       // disabled)
227       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
228       // Cancel wait on previous parallel region...
229       __kmp_itt_task_starting(itt_sync_obj);
230 
231       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
232         return false;
233 
234       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235       if (itt_sync_obj != NULL)
236         // Call prepare as early as possible for "new" barrier
237         __kmp_itt_task_finished(itt_sync_obj);
238     } else
239 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
240         // Early exit for reaping threads releasing forkjoin barrier
241         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242       return false;
243 // The worker thread may now assume that the team is valid.
244 #ifdef KMP_DEBUG
245     tid = __kmp_tid_from_gtid(gtid);
246     team = __kmp_threads[gtid]->th.th_team;
247 #endif
248     KMP_DEBUG_ASSERT(team != NULL);
249     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
250     KA_TRACE(20,
251              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
253     KMP_MB(); // Flush all pending memory write invalidates.
254   }
255   KA_TRACE(
256       20,
257       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258        gtid, team->t.t_id, tid, bt));
259   return false;
260 }
261 
262 static void __kmp_linear_barrier_gather(
263     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265   __kmp_linear_barrier_gather_template<false>(
266       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272   return __kmp_linear_barrier_gather_template<true>(
273       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279   __kmp_linear_barrier_release_template<false>(
280       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286   return __kmp_linear_barrier_release_template<true>(
287       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
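/* The thin wrappers above fix the <cancellable> template parameter at compile
   time, so the ordinary barrier path is instantiated without cancellation
   checks while the *_cancellable entry points report whether the wait was
   interrupted by cancellation. */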
289 
290 // Tree barrier
291 static void
292 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
293                           int tid, void (*reduce)(void *, void *)
294                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
295   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
296   kmp_team_t *team = this_thr->th.th_team;
297   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
298   kmp_info_t **other_threads = team->t.t_threads;
299   kmp_uint32 nproc = this_thr->th.th_team_nproc;
300   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
301   kmp_uint32 branch_factor = 1 << branch_bits;
302   kmp_uint32 child;
303   kmp_uint32 child_tid;
304   kmp_uint64 new_state;
305 
306   KA_TRACE(
307       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
308            gtid, team->t.t_id, tid, bt));
309   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
310 
311 #if USE_ITT_BUILD && USE_ITT_NOTIFY
312   // Barrier imbalance - save arrive time to the thread
313   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
314     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
315         __itt_get_timestamp();
316   }
317 #endif
318   // Perform tree gather to wait until all threads have arrived; reduce any
319   // required data as we go
320   child_tid = (tid << branch_bits) + 1;
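  /* Worked example of the indexing, assuming branch_bits == 2 (branch_factor
     == 4): tid 0 gathers from tids 1-4, tid 1 from tids 5-8, tid 2 from tids
     9-12, and so on; workers below use the inverse mapping
     parent_tid = (tid - 1) >> branch_bits. */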
321   if (child_tid < nproc) {
322     // Parent threads wait for all their children to arrive
323     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
324     child = 1;
325     do {
326       kmp_info_t *child_thr = other_threads[child_tid];
327       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
328 #if KMP_CACHE_MANAGE
329       // Prefetch next thread's arrived count
330       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
331         KMP_CACHE_PREFETCH(
332             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
333 #endif /* KMP_CACHE_MANAGE */
334       KA_TRACE(20,
335                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
336                 "arrived(%p) == %llu\n",
337                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
338                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
339       // Wait for child to arrive
340       kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
341       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
342       ANNOTATE_BARRIER_END(child_thr);
343 #if USE_ITT_BUILD && USE_ITT_NOTIFY
344       // Barrier imbalance - write min of the thread time and a child time to
345       // the thread.
346       if (__kmp_forkjoin_frames_mode == 2) {
347         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
348                                                child_thr->th.th_bar_min_time);
349       }
350 #endif
351       if (reduce) {
352         KA_TRACE(100,
353                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
354                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
355                   team->t.t_id, child_tid));
356         ANNOTATE_REDUCE_AFTER(reduce);
357         OMPT_REDUCTION_DECL(this_thr, gtid);
358         OMPT_REDUCTION_BEGIN;
359         (*reduce)(this_thr->th.th_local.reduce_data,
360                   child_thr->th.th_local.reduce_data);
361         OMPT_REDUCTION_END;
362         ANNOTATE_REDUCE_BEFORE(reduce);
363         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
364       }
365       child++;
366       child_tid++;
367     } while (child <= branch_factor && child_tid < nproc);
368   }
369 
370   if (!KMP_MASTER_TID(tid)) { // Worker threads
371     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
372 
373     KA_TRACE(20,
374              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
375               "arrived(%p): %llu => %llu\n",
376               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
377               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
378               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
379 
380     // Mark arrival to parent thread
381     /* After performing this write, a worker thread may not assume that the team
382        is valid any more - it could be deallocated by the master thread at any
383        time.  */
384     ANNOTATE_BARRIER_BEGIN(this_thr);
385     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
386     flag.release();
387   } else {
388     // Need to update the team arrived pointer if we are the master thread
389     if (nproc > 1) // New value was already computed above
390       team->t.t_bar[bt].b_arrived = new_state;
391     else
392       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
393     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
394                   "arrived(%p) = %llu\n",
395                   gtid, team->t.t_id, tid, team->t.t_id,
396                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
397   }
398   KA_TRACE(20,
399            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
400             gtid, team->t.t_id, tid, bt));
401 }
402 
403 static void __kmp_tree_barrier_release(
404     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
405     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
406   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
407   kmp_team_t *team;
408   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
409   kmp_uint32 nproc;
410   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
411   kmp_uint32 branch_factor = 1 << branch_bits;
412   kmp_uint32 child;
413   kmp_uint32 child_tid;
414 
415   // Perform a tree release for all of the threads that have been gathered
416   if (!KMP_MASTER_TID(
417           tid)) { // Handle fork barrier workers who aren't part of a team yet
418     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
419                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
420     // Wait for parent thread to release us
421     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
422     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
423     ANNOTATE_BARRIER_END(this_thr);
424 #if USE_ITT_BUILD && USE_ITT_NOTIFY
425     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
426       // In fork barrier where we could not get the object reliably (or
427       // ITTNOTIFY is disabled)
428       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
429       // Cancel wait on previous parallel region...
430       __kmp_itt_task_starting(itt_sync_obj);
431 
432       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433         return;
434 
435       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
436       if (itt_sync_obj != NULL)
437         // Call prepare as early as possible for "new" barrier
438         __kmp_itt_task_finished(itt_sync_obj);
439     } else
440 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
441         // Early exit for reaping threads releasing forkjoin barrier
442         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
443       return;
444 
445     // The worker thread may now assume that the team is valid.
446     team = __kmp_threads[gtid]->th.th_team;
447     KMP_DEBUG_ASSERT(team != NULL);
448     tid = __kmp_tid_from_gtid(gtid);
449 
450     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
451     KA_TRACE(20,
452              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
453               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
454     KMP_MB(); // Flush all pending memory write invalidates.
455   } else {
456     team = __kmp_threads[gtid]->th.th_team;
457     KMP_DEBUG_ASSERT(team != NULL);
458     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
459                   "barrier type %d\n",
460                   gtid, team->t.t_id, tid, bt));
461   }
462   nproc = this_thr->th.th_team_nproc;
463   child_tid = (tid << branch_bits) + 1;
464 
465   if (child_tid < nproc) {
466     kmp_info_t **other_threads = team->t.t_threads;
467     child = 1;
468     // Parent threads release all their children
469     do {
470       kmp_info_t *child_thr = other_threads[child_tid];
471       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473       // Prefetch next thread's go count
474       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
475         KMP_CACHE_PREFETCH(
476             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
477 #endif /* KMP_CACHE_MANAGE */
478 
479 #if KMP_BARRIER_ICV_PUSH
480       {
481         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
482         if (propagate_icvs) {
483           __kmp_init_implicit_task(team->t.t_ident,
484                                    team->t.t_threads[child_tid], team,
485                                    child_tid, FALSE);
486           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
487                     &team->t.t_implicit_task_taskdata[0].td_icvs);
488         }
489       }
490 #endif // KMP_BARRIER_ICV_PUSH
491       KA_TRACE(20,
492                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
493                 " go(%p): %u => %u\n",
494                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
495                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
496                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
497       // Release child from barrier
498       ANNOTATE_BARRIER_BEGIN(child_thr);
499       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
500       flag.release();
501       child++;
502       child_tid++;
503     } while (child <= branch_factor && child_tid < nproc);
504   }
505   KA_TRACE(
506       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
507            gtid, team->t.t_id, tid, bt));
508 }
509 
510 // Hyper Barrier
511 static void
512 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
513                            int tid, void (*reduce)(void *, void *)
514                                         USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
515   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
516   kmp_team_t *team = this_thr->th.th_team;
517   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
518   kmp_info_t **other_threads = team->t.t_threads;
519   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
520   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
521   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
522   kmp_uint32 branch_factor = 1 << branch_bits;
523   kmp_uint32 offset;
524   kmp_uint32 level;
525 
526   KA_TRACE(
527       20,
528       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
529        gtid, team->t.t_id, tid, bt));
530   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
531 
532 #if USE_ITT_BUILD && USE_ITT_NOTIFY
533   // Barrier imbalance - save arrive time to the thread
534   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
535     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
536         __itt_get_timestamp();
537   }
538 #endif
539   /* Perform a hypercube-embedded tree gather to wait until all of the threads
540      have arrived, and reduce any required data as we go.  */
541   kmp_flag_64<> p_flag(&thr_bar->b_arrived);
542   for (level = 0, offset = 1; offset < num_threads;
543        level += branch_bits, offset <<= branch_bits) {
544     kmp_uint32 child;
545     kmp_uint32 child_tid;
546 
547     if (((tid >> level) & (branch_factor - 1)) != 0) {
548       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
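      /* Example, assuming branch_factor == 2 (branch_bits == 1): at level 0
         every odd tid signals tid & ~1 (its even neighbor); of the threads
         that remain, at level 1 tids 2, 6, 10, ... signal tid & ~3, and so
         on, halving the set of active parents at each level. */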
549 
550       KMP_MB(); // Synchronize parent and child threads.
551       KA_TRACE(20,
552                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
553                 "arrived(%p): %llu => %llu\n",
554                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
555                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
556                 thr_bar->b_arrived,
557                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
558       // Mark arrival to parent thread
559       /* After performing this write (in the last iteration of the enclosing for
560          loop), a worker thread may not assume that the team is valid any more
561          - it could be deallocated by the master thread at any time.  */
562       ANNOTATE_BARRIER_BEGIN(this_thr);
563       p_flag.set_waiter(other_threads[parent_tid]);
564       p_flag.release();
565       break;
566     }
567 
568     // Parent threads wait for children to arrive
569     if (new_state == KMP_BARRIER_UNUSED_STATE)
570       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
571     for (child = 1, child_tid = tid + (1 << level);
572          child < branch_factor && child_tid < num_threads;
573          child++, child_tid += (1 << level)) {
574       kmp_info_t *child_thr = other_threads[child_tid];
575       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
576 #if KMP_CACHE_MANAGE
577       kmp_uint32 next_child_tid = child_tid + (1 << level);
578       // Prefetch next thread's arrived count
579       if (child + 1 < branch_factor && next_child_tid < num_threads)
580         KMP_CACHE_PREFETCH(
581             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
582 #endif /* KMP_CACHE_MANAGE */
583       KA_TRACE(20,
584                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
585                 "arrived(%p) == %llu\n",
586                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
587                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
588       // Wait for child to arrive
589       kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
590       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
591       ANNOTATE_BARRIER_END(child_thr);
592       KMP_MB(); // Synchronize parent and child threads.
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
594       // Barrier imbalance - write min of the thread time and a child time to
595       // the thread.
596       if (__kmp_forkjoin_frames_mode == 2) {
597         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598                                                child_thr->th.th_bar_min_time);
599       }
600 #endif
601       if (reduce) {
602         KA_TRACE(100,
603                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605                   team->t.t_id, child_tid));
606         ANNOTATE_REDUCE_AFTER(reduce);
607         OMPT_REDUCTION_DECL(this_thr, gtid);
608         OMPT_REDUCTION_BEGIN;
609         (*reduce)(this_thr->th.th_local.reduce_data,
610                   child_thr->th.th_local.reduce_data);
611         OMPT_REDUCTION_END;
612         ANNOTATE_REDUCE_BEFORE(reduce);
613         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614       }
615     }
616   }
617 
618   if (KMP_MASTER_TID(tid)) {
619     // Need to update the team arrived pointer if we are the master thread
620     if (new_state == KMP_BARRIER_UNUSED_STATE)
621       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622     else
623       team->t.t_bar[bt].b_arrived = new_state;
624     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625                   "arrived(%p) = %llu\n",
626                   gtid, team->t.t_id, tid, team->t.t_id,
627                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628   }
629   KA_TRACE(
630       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631            gtid, team->t.t_id, tid, bt));
632 }
633 
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640   kmp_team_t *team;
641   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642   kmp_info_t **other_threads;
643   kmp_uint32 num_threads;
644   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645   kmp_uint32 branch_factor = 1 << branch_bits;
646   kmp_uint32 child;
647   kmp_uint32 child_tid;
648   kmp_uint32 offset;
649   kmp_uint32 level;
650 
651   /* Perform a hypercube-embedded tree release for all of the threads that have
652      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653      are released in the reverse order of the corresponding gather, otherwise
654      threads are released in the same order. */
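  /* Example, assuming branch_bits == 1 and 8 threads: the gather has the
     master collect from tids 1, 2, then 4 (lowest level first); with
     KMP_REVERSE_HYPER_BAR the master releases 4, then 2, then 1, so the
     thread heading the deepest remaining subtree (tid 4, which still has to
     wake tids 5-7) is released first. */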
655   if (KMP_MASTER_TID(tid)) { // master
656     team = __kmp_threads[gtid]->th.th_team;
657     KMP_DEBUG_ASSERT(team != NULL);
658     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659                   "barrier type %d\n",
660                   gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662     if (propagate_icvs) { // master already has ICVs in final destination; copy
663       copy_icvs(&thr_bar->th_fixed_icvs,
664                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
665     }
666 #endif
667   } else { // Handle fork barrier workers who aren't part of a team yet
668     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670     // Wait for parent thread to release us
671     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673     ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676       // In fork barrier where we could not get the object reliably
677       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678       // Cancel wait on previous parallel region...
679       __kmp_itt_task_starting(itt_sync_obj);
680 
681       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682         return;
683 
684       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685       if (itt_sync_obj != NULL)
686         // Call prepare as early as possible for "new" barrier
687         __kmp_itt_task_finished(itt_sync_obj);
688     } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690         // Early exit for reaping threads releasing forkjoin barrier
691         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692       return;
693 
694     // The worker thread may now assume that the team is valid.
695     team = __kmp_threads[gtid]->th.th_team;
696     KMP_DEBUG_ASSERT(team != NULL);
697     tid = __kmp_tid_from_gtid(gtid);
698 
699     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700     KA_TRACE(20,
701              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703     KMP_MB(); // Flush all pending memory write invalidates.
704   }
705   num_threads = this_thr->th.th_team_nproc;
706   other_threads = team->t.t_threads;
707 
708 #ifdef KMP_REVERSE_HYPER_BAR
709   // Count up to correct level for parent
710   for (level = 0, offset = 1;
711        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712        level += branch_bits, offset <<= branch_bits)
713     ;
714 
715   // Now go down from there
716   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717        level -= branch_bits, offset >>= branch_bits)
718 #else
719   // Go down the tree, level by level
720   for (level = 0, offset = 1; offset < num_threads;
721        level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723   {
724 #ifdef KMP_REVERSE_HYPER_BAR
725     /* Now go in reverse order through the children, highest to lowest.
726        Initial setting of child is conservative here. */
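    /* The starting child index is only an upper bound: it is clamped to
       branch_factor - 1 and the loop counts down to slot 1, while the
       child_tid >= num_threads check in the body skips slots with no thread
       behind them. */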
727     child = num_threads >> ((level == 0) ? level : level - 1);
728     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729         child_tid = tid + (child << level);
730          child >= 1; child--, child_tid -= (1 << level))
731 #else
732     if (((tid >> level) & (branch_factor - 1)) != 0)
733       // No need to go lower than this, since this is the level parent would be
734       // notified
735       break;
736     // Iterate through children on this level of the tree
737     for (child = 1, child_tid = tid + (1 << level);
738          child < branch_factor && child_tid < num_threads;
739          child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741     {
742       if (child_tid >= num_threads)
743         continue; // Child doesn't exist so keep going
744       else {
745         kmp_info_t *child_thr = other_threads[child_tid];
746         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748         kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751         if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753         if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755           KMP_CACHE_PREFETCH(
756               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758 
759 #if KMP_BARRIER_ICV_PUSH
760         if (propagate_icvs) // push my fixed ICVs to my child
761           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763 
764         KA_TRACE(
765             20,
766             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767              " go(%p): %u => %u\n",
768              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771         // Release child from barrier
772         ANNOTATE_BARRIER_BEGIN(child_thr);
773         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
774         flag.release();
775       }
776     }
777   }
778 #if KMP_BARRIER_ICV_PUSH
779   if (propagate_icvs &&
780       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782                              FALSE);
783     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784               &thr_bar->th_fixed_icvs);
785   }
786 #endif
787   KA_TRACE(
788       20,
789       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790        gtid, team->t.t_id, tid, bt));
791 }
792 
793 // Hierarchical Barrier
794 
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797    Performs the minimum amount of initialization required based on how the team
798    has changed. Returns true if leaf children will require both on-core and
799    traditional wake-up mechanisms. For example, if the team size increases,
800    threads already in the team will respond to on-core wakeup on their parent
801    thread, but threads newly added to the team will only be listening on
802    their local b_go. */
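/* Rough sketch of the per-thread hierarchy data set up below (an informal
   reading of the code, not a specification): skip_per_level[d] is the span,
   in consecutive tids, of a depth-d subtree, so a thread's parent is its tid
   rounded down to a multiple of skip_per_level[d+1]; my_level is the thread's
   own depth (0 for a leaf); and leaf_kids/leaf_state describe the on-core
   children that report in through individual bytes of this thread's 64-bit
   flag words. */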
803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804                                                    kmp_bstate_t *thr_bar,
805                                                    kmp_uint32 nproc, int gtid,
806                                                    int tid, kmp_team_t *team) {
807   // Checks to determine if (re-)initialization is needed
808   bool uninitialized = thr_bar->team == NULL;
809   bool team_changed = team != thr_bar->team;
810   bool team_sz_changed = nproc != thr_bar->nproc;
811   bool tid_changed = tid != thr_bar->old_tid;
812   bool retval = false;
813 
814   if (uninitialized || team_sz_changed) {
815     __kmp_get_hierarchy(nproc, thr_bar);
816   }
817 
818   if (uninitialized || team_sz_changed || tid_changed) {
819     thr_bar->my_level = thr_bar->depth - 1; // default for master
820     thr_bar->parent_tid = -1; // default for master
821     if (!KMP_MASTER_TID(
822             tid)) { // if not master, find parent thread in hierarchy
823       kmp_uint32 d = 0;
824       while (d < thr_bar->depth) { // find parent based on level of thread in
825         // hierarchy, and note level
826         kmp_uint32 rem;
827         if (d == thr_bar->depth - 2) { // reached level right below the master
828           thr_bar->parent_tid = 0;
829           thr_bar->my_level = d;
830           break;
831         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
832                    0) { // TODO: can we make this op faster?
833           // thread is not a subtree root at the next level, so this is its max level
834           thr_bar->parent_tid = tid - rem;
835           thr_bar->my_level = d;
836           break;
837         }
838         ++d;
839       }
840     }
841     thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
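    // In effect each of the (at most 8) on-core children owns one byte of its
    // parent's 64-bit flag word: child k (k = tid - parent_tid - 1) uses byte
    // 7 - k, the same slot the leaf_state initialization below and the
    // kmp_flag_oncore waits/releases refer to.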
842     thr_bar->old_tid = tid;
843     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844     thr_bar->team = team;
845     thr_bar->parent_bar =
846         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847   }
848   if (uninitialized || team_changed || tid_changed) {
849     thr_bar->team = team;
850     thr_bar->parent_bar =
851         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852     retval = true;
853   }
854   if (uninitialized || team_sz_changed || tid_changed) {
855     thr_bar->nproc = nproc;
856     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857     if (thr_bar->my_level == 0)
858       thr_bar->leaf_kids = 0;
859     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860       thr_bar->leaf_kids = nproc - tid - 1;
861     thr_bar->leaf_state = 0;
862     for (int i = 0; i < thr_bar->leaf_kids; ++i)
863       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864   }
865   return retval;
866 }
867 
868 static void __kmp_hierarchical_barrier_gather(
869     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872   kmp_team_t *team = this_thr->th.th_team;
873   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874   kmp_uint32 nproc = this_thr->th.th_team_nproc;
875   kmp_info_t **other_threads = team->t.t_threads;
876   kmp_uint64 new_state;
877 
878   int level = team->t.t_level;
879   if (other_threads[0]
880           ->th.th_teams_microtask) // are we inside the teams construct?
881     if (this_thr->th.th_teams_size.nteams > 1)
882       ++level; // level was not increased in teams construct for team_of_masters
883   if (level == 1)
884     thr_bar->use_oncore_barrier = 1;
885   else
886     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887 
888   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889                 "barrier type %d\n",
890                 gtid, team->t.t_id, tid, bt));
891   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892 
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894   // Barrier imbalance - save arrive time to the thread
895   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897   }
898 #endif
899 
900   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901                                                team);
902 
903   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904     kmp_int32 child_tid;
905     new_state =
906         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908         thr_bar->use_oncore_barrier) {
909       if (thr_bar->leaf_kids) {
910         // First, wait for leaf children to check-in on my b_arrived flag
911         kmp_uint64 leaf_state =
912             KMP_MASTER_TID(tid)
913                 ? thr_bar->b_arrived | thr_bar->leaf_state
914                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
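        /* In effect each on-core leaf child sets its own byte of this
           thread's b_arrived (via kmp_flag_oncore), so once every byte named
           in leaf_state has been filled the 64-bit value matches leaf_state
           and the single wait below completes; the KMP_TEST_THEN_AND64
           afterwards clears those bytes for the next barrier. */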
915         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916                       "for leaf kids\n",
917                       gtid, team->t.t_id, tid));
918         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
919         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920         if (reduce) {
921           ANNOTATE_REDUCE_AFTER(reduce);
922           OMPT_REDUCTION_DECL(this_thr, gtid);
923           OMPT_REDUCTION_BEGIN;
924           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925                ++child_tid) {
926             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927                            "T#%d(%d:%d)\n",
928                            gtid, team->t.t_id, tid,
929                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930                            child_tid));
931             ANNOTATE_BARRIER_END(other_threads[child_tid]);
932             (*reduce)(this_thr->th.th_local.reduce_data,
933                       other_threads[child_tid]->th.th_local.reduce_data);
934           }
935           OMPT_REDUCTION_END;
936           ANNOTATE_REDUCE_BEFORE(reduce);
937           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938         }
939         // clear leaf_state bits
940         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941       }
942       // Next, wait for higher level children on each child's b_arrived flag
943       for (kmp_uint32 d = 1; d < thr_bar->my_level;
944            ++d) { // gather lowest level threads first, but skip 0
945         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946                    skip = thr_bar->skip_per_level[d];
947         if (last > nproc)
948           last = nproc;
949         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950           kmp_info_t *child_thr = other_threads[child_tid];
951           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953                         "T#%d(%d:%d) "
954                         "arrived(%p) == %llu\n",
955                         gtid, team->t.t_id, tid,
956                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957                         child_tid, &child_bar->b_arrived, new_state));
958           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
959           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960           ANNOTATE_BARRIER_END(child_thr);
961           if (reduce) {
962             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963                            "T#%d(%d:%d)\n",
964                            gtid, team->t.t_id, tid,
965                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966                            child_tid));
967             ANNOTATE_REDUCE_AFTER(reduce);
968             (*reduce)(this_thr->th.th_local.reduce_data,
969                       child_thr->th.th_local.reduce_data);
970             ANNOTATE_REDUCE_BEFORE(reduce);
971             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972           }
973         }
974       }
975     } else { // Blocktime is not infinite
976       for (kmp_uint32 d = 0; d < thr_bar->my_level;
977            ++d) { // Gather lowest level threads first
978         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979                    skip = thr_bar->skip_per_level[d];
980         if (last > nproc)
981           last = nproc;
982         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983           kmp_info_t *child_thr = other_threads[child_tid];
984           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986                         "T#%d(%d:%d) "
987                         "arrived(%p) == %llu\n",
988                         gtid, team->t.t_id, tid,
989                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990                         child_tid, &child_bar->b_arrived, new_state));
991           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
992           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993           ANNOTATE_BARRIER_END(child_thr);
994           if (reduce) {
995             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996                            "T#%d(%d:%d)\n",
997                            gtid, team->t.t_id, tid,
998                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999                            child_tid));
1000             ANNOTATE_REDUCE_AFTER(reduce);
1001             (*reduce)(this_thr->th.th_local.reduce_data,
1002                       child_thr->th.th_local.reduce_data);
1003             ANNOTATE_REDUCE_BEFORE(reduce);
1004             ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005           }
1006         }
1007       }
1008     }
1009   }
1010   // All subordinates are gathered; now release parent if not master thread
1011 
1012   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015                   gtid, team->t.t_id, tid,
1016                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019     /* Mark arrival to parent: After performing this write, a worker thread may
1020        not assume that the team is valid any more - it could be deallocated by
1021        the master thread at any time. */
1022     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024       // flag; release it
1025       ANNOTATE_BARRIER_BEGIN(this_thr);
1026       kmp_flag_64<> flag(&thr_bar->b_arrived,
1027                          other_threads[thr_bar->parent_tid]);
1028       flag.release();
1029     } else {
1030       // Leaf does special release on "offset" bits of parent's b_arrived flag
1031       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1032       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1033       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1034       flag.release();
1035     }
1036   } else { // Master thread needs to update the team's b_arrived value
1037     team->t.t_bar[bt].b_arrived = new_state;
1038     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1039                   "arrived(%p) = %llu\n",
1040                   gtid, team->t.t_id, tid, team->t.t_id,
1041                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1042   }
1043   // Is the team access below unsafe or just technically invalid?
1044   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1045                 "barrier type %d\n",
1046                 gtid, team->t.t_id, tid, bt));
1047 }
1048 
1049 static void __kmp_hierarchical_barrier_release(
1050     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1051     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1052   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1053   kmp_team_t *team;
1054   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1055   kmp_uint32 nproc;
1056   bool team_change = false; // indicates on-core barrier shouldn't be used
1057 
1058   if (KMP_MASTER_TID(tid)) {
1059     team = __kmp_threads[gtid]->th.th_team;
1060     KMP_DEBUG_ASSERT(team != NULL);
1061     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1062                   "entered barrier type %d\n",
1063                   gtid, team->t.t_id, tid, bt));
1064   } else { // Worker threads
1065     // Wait for parent thread to release me
1066     if (!thr_bar->use_oncore_barrier ||
1067         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1068         thr_bar->team == NULL) {
1069       // Use traditional method of waiting on my own b_go flag
1070       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1071       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1072       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1073       ANNOTATE_BARRIER_END(this_thr);
1074       TCW_8(thr_bar->b_go,
1075             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1076     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1077       // infinite, not nested
1078       // Wait on my "offset" bits on parent's b_go flag
1079       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1080       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1081                            thr_bar->offset, bt,
1082                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1083       flag.wait(this_thr, TRUE);
1084       if (thr_bar->wait_flag ==
1085           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1086         TCW_8(thr_bar->b_go,
1087               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1088       } else { // Reset my bits on parent's b_go flag
1089         (RCAST(volatile char *,
1090                &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1091       }
1092     }
1093     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1094     // Early exit for reaping threads releasing forkjoin barrier
1095     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1096       return;
1097     // The worker thread may now assume that the team is valid.
1098     team = __kmp_threads[gtid]->th.th_team;
1099     KMP_DEBUG_ASSERT(team != NULL);
1100     tid = __kmp_tid_from_gtid(gtid);
1101 
1102     KA_TRACE(
1103         20,
1104         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1105          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1106     KMP_MB(); // Flush all pending memory write invalidates.
1107   }
1108 
1109   nproc = this_thr->th.th_team_nproc;
1110   int level = team->t.t_level;
1111   if (team->t.t_threads[0]
1112           ->th.th_teams_microtask) { // are we inside the teams construct?
1113     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1114         this_thr->th.th_teams_level == level)
1115       ++level; // level was not increased in teams construct for team_of_workers
1116     if (this_thr->th.th_teams_size.nteams > 1)
1117       ++level; // level was not increased in teams construct for team_of_masters
1118   }
1119   if (level == 1)
1120     thr_bar->use_oncore_barrier = 1;
1121   else
1122     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1123 
1124   // If the team size has increased, we still communicate with old leaves via
1125   // oncore barrier.
1126   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1127   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1128   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1129                                                        tid, team);
1130   // But if the entire team changes, we won't use oncore barrier at all
1131   if (team_change)
1132     old_leaf_kids = 0;
1133 
1134 #if KMP_BARRIER_ICV_PUSH
1135   if (propagate_icvs) {
1136     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1137                              FALSE);
1138     if (KMP_MASTER_TID(
1139             tid)) { // master already has copy in final destination; copy
1140       copy_icvs(&thr_bar->th_fixed_icvs,
1141                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1142     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1143                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1144       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1145         // leaves (on-core children) pull parent's fixed ICVs directly to local
1146         // ICV store
1147         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1148                   &thr_bar->parent_bar->th_fixed_icvs);
1149       // non-leaves will get ICVs piggybacked with b_go via NGO store
1150     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1151       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1152         // access
1153         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1154       else // leaves copy parent's fixed ICVs directly to local ICV store
1155         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1156                   &thr_bar->parent_bar->th_fixed_icvs);
1157     }
1158   }
1159 #endif // KMP_BARRIER_ICV_PUSH
1160 
1161   // Now, release my children
1162   if (thr_bar->my_level) { // not a leaf
1163     kmp_int32 child_tid;
1164     kmp_uint32 last;
1165     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1166         thr_bar->use_oncore_barrier) {
1167       if (KMP_MASTER_TID(tid)) { // do a flat release
1168         // Set local b_go to bump children via NGO store of the cache line
1169         // containing ICVs and b_go.
1170         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1171         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1172         // the cache line
1173         ngo_load(&thr_bar->th_fixed_icvs);
1174         // This loops over all the threads skipping only the leaf nodes in the
1175         // hierarchy
1176         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1177              child_tid += thr_bar->skip_per_level[1]) {
1178           kmp_bstate_t *child_bar =
1179               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1180           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1181                         "releasing T#%d(%d:%d)"
1182                         " go(%p): %u => %u\n",
1183                         gtid, team->t.t_id, tid,
1184                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1185                         child_tid, &child_bar->b_go, child_bar->b_go,
1186                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1187           // Use ngo store (if available) to both store ICVs and release child
1188           // via child's b_go
1189           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1190         }
1191         ngo_sync();
1192       }
1193       TCW_8(thr_bar->b_go,
1194             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1195       // Now, release leaf children
1196       if (thr_bar->leaf_kids) { // if there are any
1197         // We test team_change on the off-chance that the level 1 team changed.
1198         if (team_change ||
1199             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1200           if (old_leaf_kids) { // release old leaf kids
1201             thr_bar->b_go |= old_leaf_state;
1202           }
1203           // Release new leaf kids
1204           last = tid + thr_bar->skip_per_level[1];
1205           if (last > nproc)
1206             last = nproc;
1207           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1208                ++child_tid) { // skip_per_level[0]=1
1209             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1210             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1211             KA_TRACE(
1212                 20,
1213                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1214                  " T#%d(%d:%d) go(%p): %u => %u\n",
1215                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1216                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1217                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1218             // Release child using child's b_go flag
1219             ANNOTATE_BARRIER_BEGIN(child_thr);
1220             kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1221             flag.release();
1222           }
1223         } else { // Release all children at once with leaf_state bits on my own
1224           // b_go flag
1225           thr_bar->b_go |= thr_bar->leaf_state;
1226         }
1227       }
1228     } else { // Blocktime is not infinite; do a simple hierarchical release
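      // Walk the hierarchy top-down: at each depth d this thread releases the
      // children that are themselves roots of depth-d subtrees. For example,
      // if skip_per_level were {1, 4, 16} and tid == 0 with my_level == 2, the
      // d == 1 pass would release tids 4, 8 and 12, and the d == 0 pass would
      // release tids 1, 2 and 3 (both passes are bounded by nproc).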
1229       for (int d = thr_bar->my_level - 1; d >= 0;
1230            --d) { // Release highest level threads first
1231         last = tid + thr_bar->skip_per_level[d + 1];
1232         kmp_uint32 skip = thr_bar->skip_per_level[d];
1233         if (last > nproc)
1234           last = nproc;
1235         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1236           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1237           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1238           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1239                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1240                         gtid, team->t.t_id, tid,
1241                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1242                         child_tid, &child_bar->b_go, child_bar->b_go,
1243                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1244           // Release child using child's b_go flag
1245           ANNOTATE_BARRIER_BEGIN(child_thr);
1246           kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1247           flag.release();
1248         }
1249       }
1250     }
1251 #if KMP_BARRIER_ICV_PUSH
1252     if (propagate_icvs && !KMP_MASTER_TID(tid))
1253       // non-leaves copy ICVs from fixed ICVs to local dest
1254       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1255                 &thr_bar->th_fixed_icvs);
1256 #endif // KMP_BARRIER_ICV_PUSH
1257   }
1258   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1259                 "barrier type %d\n",
1260                 gtid, team->t.t_id, tid, bt));
1261 }
1262 
1263 // End of Barrier Algorithms
1264 
1265 // type traits for cancellable value
1266 // if cancellable is true, then is_cancellable is a normal boolean variable
1267 // if cancellable is false, then is_cancellable is a compile time constant
1268 template <bool cancellable> struct is_cancellable {};
1269 template <> struct is_cancellable<true> {
1270   bool value;
1271   is_cancellable() : value(false) {}
1272   is_cancellable(bool b) : value(b) {}
1273   is_cancellable &operator=(bool b) {
1274     value = b;
1275     return *this;
1276   }
1277   operator bool() const { return value; }
1278 };
1279 template <> struct is_cancellable<false> {
1280   is_cancellable &operator=(bool b) { return *this; }
1281   constexpr operator bool() const { return false; }
1282 };
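// Illustrative sketch (not part of the runtime): because the <false>
// specialization converts to a constant false, a guard such as
//
//   is_cancellable<false> cancelled;
//   if (cancelled)             // constant-folds to if (false)
//     handle_cancellation();   // hypothetical handler, for illustration only
//
// lets the compiler drop the cancellation branches entirely in the
// non-cancellable instantiation of __kmp_barrier_template below, while the
// <true> specialization behaves like an ordinary bool.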
1283 
1284 // Internal function to do a barrier.
1285 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1286    If reduce is non-NULL, perform the reduction as part of the gather phase
1287    of the barrier.
1288    When cancellable = false,
1289      Returns 0 if master thread, 1 if worker thread.
1290    When cancellable = true,
1291      Returns 0 if not cancelled, 1 if cancelled.  */
1292 template <bool cancellable = false>
1293 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1294                                   size_t reduce_size, void *reduce_data,
1295                                   void (*reduce)(void *, void *)) {
1296   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1297   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1298   int tid = __kmp_tid_from_gtid(gtid);
1299   kmp_info_t *this_thr = __kmp_threads[gtid];
1300   kmp_team_t *team = this_thr->th.th_team;
1301   int status = 0;
1302   is_cancellable<cancellable> cancelled;
1303 #if OMPT_SUPPORT && OMPT_OPTIONAL
1304   ompt_data_t *my_task_data;
1305   ompt_data_t *my_parallel_data;
1306   void *return_address;
1307   ompt_sync_region_t barrier_kind;
1308 #endif
1309 
1310   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1311                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1312 
1313   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1314 #if OMPT_SUPPORT
1315   if (ompt_enabled.enabled) {
1316 #if OMPT_OPTIONAL
1317     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1318     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1319     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1320     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1321     if (ompt_enabled.ompt_callback_sync_region) {
1322       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1323           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1324           return_address);
1325     }
1326     if (ompt_enabled.ompt_callback_sync_region_wait) {
1327       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1328           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1329           return_address);
1330     }
1331 #endif
1332     // It is OK to report the barrier state after the barrier begin callback.
1333     // According to the OMPT specification, a compliant implementation may
1334     // even delay reporting this state until the barrier begins to wait.
1335     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1336   }
1337 #endif
1338 
1339   if (!team->t.t_serialized) {
1340 #if USE_ITT_BUILD
1341     // This value will be used in itt notify events below.
1342     void *itt_sync_obj = NULL;
1343 #if USE_ITT_NOTIFY
1344     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1345       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1346 #endif
1347 #endif /* USE_ITT_BUILD */
1348     if (__kmp_tasking_mode == tskm_extra_barrier) {
1349       __kmp_tasking_barrier(team, this_thr, gtid);
1350       KA_TRACE(15,
1351                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1352                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1353     }
1354 
1355     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1356        access it when the team struct is not guaranteed to exist. */
1357     // See note about the corresponding code in __kmp_join_barrier() being
1358     // performance-critical.
1359     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1360 #if KMP_USE_MONITOR
1361       this_thr->th.th_team_bt_intervals =
1362           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1363       this_thr->th.th_team_bt_set =
1364           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1365 #else
1366       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1367 #endif
1368     }
1369 
1370 #if USE_ITT_BUILD
1371     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1372       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1373 #endif /* USE_ITT_BUILD */
1374 #if USE_DEBUGGER
1375     // Let the debugger know: the thread arrived at the barrier and is waiting.
1376     if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1377       team->t.t_bar[bt].b_master_arrived += 1;
1378     } else {
1379       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1380     } // if
1381 #endif /* USE_DEBUGGER */
1382     if (reduce != NULL) {
1383       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1384       this_thr->th.th_local.reduce_data = reduce_data;
1385     }
1386 
1387     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1388       // use 0 to only setup the current team if nthreads > 1
1389       __kmp_task_team_setup(this_thr, team, 0);
1390 
1391     if (cancellable) {
1392       cancelled = __kmp_linear_barrier_gather_cancellable(
1393           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1394     } else {
1395       switch (__kmp_barrier_gather_pattern[bt]) {
1396       case bp_hyper_bar: {
1397         // don't set branch bits to 0; use linear
1398         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1399         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1400                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1401         break;
1402       }
1403       case bp_hierarchical_bar: {
1404         __kmp_hierarchical_barrier_gather(
1405             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1406         break;
1407       }
1408       case bp_tree_bar: {
1409         // don't set branch bits to 0; use linear
1410         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1411         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1412                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1413         break;
1414       }
1415       default: {
1416         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1417                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1418       }
1419       }
1420     }
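    // Which gather (and, later, release) algorithm runs for each barrier type
    // comes from __kmp_barrier_gather_pattern / __kmp_barrier_release_pattern;
    // these are normally configured at startup (e.g., via the
    // KMP_*_BARRIER_PATTERN environment variables handled in kmp_settings.cpp).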
1421 
1422     KMP_MB();
1423 
1424     if (KMP_MASTER_TID(tid)) {
1425       status = 0;
1426       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1427         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1428       }
1429 #if USE_DEBUGGER
1430       // Let the debugger know: all threads have arrived and are starting to
1431       // leave the barrier.
1432       team->t.t_bar[bt].b_team_arrived += 1;
1433 #endif
1434 
1435       if (__kmp_omp_cancellation) {
1436         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1437         // Reset cancellation flag for worksharing constructs
1438         if (cancel_request == cancel_loop ||
1439             cancel_request == cancel_sections) {
1440           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1441         }
1442       }
1443 #if USE_ITT_BUILD
1444       /* TODO: In case of split reduction barrier, master thread may send
1445          acquired event early, before the final summation into the shared
1446          variable is done (final summation can be a long operation for array
1447          reductions).  */
1448       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1449         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1450 #endif /* USE_ITT_BUILD */
1451 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1452       // Barrier - report frame end (only if active_level == 1)
1453       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1454           __kmp_forkjoin_frames_mode &&
1455           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1456            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1457           team->t.t_active_level == 1) {
1458         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1459         kmp_uint64 cur_time = __itt_get_timestamp();
1460         kmp_info_t **other_threads = team->t.t_threads;
1461         int nproc = this_thr->th.th_team_nproc;
1462         int i;
1463         switch (__kmp_forkjoin_frames_mode) {
1464         case 1:
1465           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1466                                  loc, nproc);
1467           this_thr->th.th_frame_time = cur_time;
1468           break;
1469         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1470           // be fixed)
1471           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1472                                  1, loc, nproc);
1473           break;
1474         case 3:
1475           if (__itt_metadata_add_ptr) {
1476             // Initialize with master's wait time
1477             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478             // Set arrive time to zero to be able to check it in
1479             // __kmp_invoke_task(); the same is done inside the loop below
1480             this_thr->th.th_bar_arrive_time = 0;
1481             for (i = 1; i < nproc; ++i) {
1482               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1483               other_threads[i]->th.th_bar_arrive_time = 0;
1484             }
1485             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1486                                          cur_time, delta,
1487                                          (kmp_uint64)(reduce != NULL));
1488           }
1489           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1490                                  loc, nproc);
1491           this_thr->th.th_frame_time = cur_time;
1492           break;
1493         }
1494       }
1495 #endif /* USE_ITT_BUILD */
1496     } else {
1497       status = 1;
1498 #if USE_ITT_BUILD
1499       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 #endif /* USE_ITT_BUILD */
1502     }
1503     if ((status == 1 || !is_split) && !cancelled) {
1504       if (cancellable) {
1505         cancelled = __kmp_linear_barrier_release_cancellable(
1506             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507       } else {
1508         switch (__kmp_barrier_release_pattern[bt]) {
1509         case bp_hyper_bar: {
1510           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1511           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1512                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513           break;
1514         }
1515         case bp_hierarchical_bar: {
1516           __kmp_hierarchical_barrier_release(
1517               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518           break;
1519         }
1520         case bp_tree_bar: {
1521           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1522           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1523                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524           break;
1525         }
1526         default: {
1527           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1528                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1529         }
1530         }
1531       }
1532       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1533         __kmp_task_team_sync(this_thr, team);
1534       }
1535     }
1536 
1537 #if USE_ITT_BUILD
1538     /* GEH: TODO: Move this under if-condition above and also include in
1539        __kmp_end_split_barrier(). This will more accurately represent the actual
1540        release time of the threads for split barriers.  */
1541     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1542       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1543 #endif /* USE_ITT_BUILD */
1544   } else { // Team is serialized.
1545     status = 0;
1546     if (__kmp_tasking_mode != tskm_immediate_exec) {
1547       if (this_thr->th.th_task_team != NULL) {
1548 #if USE_ITT_NOTIFY
1549         void *itt_sync_obj = NULL;
1550         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1552           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1553         }
1554 #endif
1555 
1556         KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1557                          TRUE);
1558         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1559         __kmp_task_team_setup(this_thr, team, 0);
1560 
1561 #if USE_ITT_BUILD
1562         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1563           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564 #endif /* USE_ITT_BUILD */
1565       }
1566     }
1567   }
1568   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1569                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1570                 __kmp_tid_from_gtid(gtid), status));
1571 
1572 #if OMPT_SUPPORT
1573   if (ompt_enabled.enabled) {
1574 #if OMPT_OPTIONAL
1575     if (ompt_enabled.ompt_callback_sync_region_wait) {
1576       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1577           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578           return_address);
1579     }
1580     if (ompt_enabled.ompt_callback_sync_region) {
1581       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1582           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583           return_address);
1584     }
1585 #endif
1586     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587   }
1588 #endif
1589   ANNOTATE_BARRIER_END(&team->t.t_bar);
1590 
1591   if (cancellable)
1592     return (int)cancelled;
1593   return status;
1594 }
1595 
1596 // Returns 0 if master thread, 1 if worker thread.
1597 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1598                   size_t reduce_size, void *reduce_data,
1599                   void (*reduce)(void *, void *)) {
1600   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1601                                   reduce);
1602 }
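// Note: this non-cancellable instantiation is the one used by the rest of the
// runtime (e.g., __kmpc_barrier in kmp_csupport.cpp); the cancellable
// instantiation is used by __kmp_barrier_gomp_cancel below for GOMP
// compatibility.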
1603 
1604 #if defined(KMP_GOMP_COMPAT)
1605 // Returns 1 if cancelled, 0 otherwise
1606 int __kmp_barrier_gomp_cancel(int gtid) {
1607   if (__kmp_omp_cancellation) {
1608     int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1609                                                  0, NULL, NULL);
1610     if (cancelled) {
1611       int tid = __kmp_tid_from_gtid(gtid);
1612       kmp_info_t *this_thr = __kmp_threads[gtid];
1613       if (KMP_MASTER_TID(tid)) {
1614         // Master does not need to revert anything
1615       } else {
1616         // Workers need to revert their private b_arrived flag
1617         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1618             KMP_BARRIER_STATE_BUMP;
1619       }
1620     }
1621     return cancelled;
1622   }
1623   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1624   return FALSE;
1625 }
1626 #endif
1627 
1628 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631   int tid = __kmp_tid_from_gtid(gtid);
1632   kmp_info_t *this_thr = __kmp_threads[gtid];
1633   kmp_team_t *team = this_thr->th.th_team;
1634 
1635   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636   if (!team->t.t_serialized) {
1637     if (KMP_MASTER_GTID(gtid)) {
1638       switch (__kmp_barrier_release_pattern[bt]) {
1639       case bp_hyper_bar: {
1640         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642                                     FALSE USE_ITT_BUILD_ARG(NULL));
1643         break;
1644       }
1645       case bp_hierarchical_bar: {
1646         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647                                            FALSE USE_ITT_BUILD_ARG(NULL));
1648         break;
1649       }
1650       case bp_tree_bar: {
1651         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653                                    FALSE USE_ITT_BUILD_ARG(NULL));
1654         break;
1655       }
1656       default: {
1657         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658                                      FALSE USE_ITT_BUILD_ARG(NULL));
1659       }
1660       }
1661       if (__kmp_tasking_mode != tskm_immediate_exec) {
1662         __kmp_task_team_sync(this_thr, team);
1663       } // if
1664     }
1665   }
1666   ANNOTATE_BARRIER_END(&team->t.t_bar);
1667 }
1668 
1669 void __kmp_join_barrier(int gtid) {
1670   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1671   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1672   kmp_info_t *this_thr = __kmp_threads[gtid];
1673   kmp_team_t *team;
1674   kmp_uint nproc;
1675   kmp_info_t *master_thread;
1676   int tid;
1677 #ifdef KMP_DEBUG
1678   int team_id;
1679 #endif /* KMP_DEBUG */
1680 #if USE_ITT_BUILD
1681   void *itt_sync_obj = NULL;
1682 #if USE_ITT_NOTIFY
1683   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1684     // Get object created at fork_barrier
1685     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686 #endif
1687 #endif /* USE_ITT_BUILD */
1688   KMP_MB();
1689 
1690   // Get current info
1691   team = this_thr->th.th_team;
1692   nproc = this_thr->th.th_team_nproc;
1693   KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1694   tid = __kmp_tid_from_gtid(gtid);
1695 #ifdef KMP_DEBUG
1696   team_id = team->t.t_id;
1697 #endif /* KMP_DEBUG */
1698   master_thread = this_thr->th.th_team_master;
1699 #ifdef KMP_DEBUG
1700   if (master_thread != team->t.t_threads[0]) {
1701     __kmp_print_structure();
1702   }
1703 #endif /* KMP_DEBUG */
1704   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1705   KMP_MB();
1706 
1707   // Verify state
1708   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1709   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1710   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1711   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1712   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1713                 gtid, team_id, tid));
1714 
1715   ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1716 #if OMPT_SUPPORT
1717   if (ompt_enabled.enabled) {
1718 #if OMPT_OPTIONAL
1719     ompt_data_t *my_task_data;
1720     ompt_data_t *my_parallel_data;
1721     void *codeptr = NULL;
1722     int ds_tid = this_thr->th.th_info.ds.ds_tid;
1723     if (KMP_MASTER_TID(ds_tid) &&
1724         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1725          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1726       codeptr = team->t.ompt_team_info.master_return_address;
1727     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1728     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1729     if (ompt_enabled.ompt_callback_sync_region) {
1730       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1731           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1732           my_task_data, codeptr);
1733     }
1734     if (ompt_enabled.ompt_callback_sync_region_wait) {
1735       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1736           ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1737           my_task_data, codeptr);
1738     }
1739     if (!KMP_MASTER_TID(ds_tid))
1740       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1741 #endif
1742     this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1743   }
1744 #endif
1745 
1746   if (__kmp_tasking_mode == tskm_extra_barrier) {
1747     __kmp_tasking_barrier(team, this_thr, gtid);
1748     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1749                   team_id, tid));
1750   }
1751 #ifdef KMP_DEBUG
1752   if (__kmp_tasking_mode != tskm_immediate_exec) {
1753     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1754                   "%p, th_task_team = %p\n",
1755                   __kmp_gtid_from_thread(this_thr), team_id,
1756                   team->t.t_task_team[this_thr->th.th_task_state],
1757                   this_thr->th.th_task_team));
1758     KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1759                      team->t.t_task_team[this_thr->th.th_task_state]);
1760   }
1761 #endif /* KMP_DEBUG */
1762 
1763   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1764      access it when the team struct is not guaranteed to exist. Doing these
1765      loads causes a cache miss that slows down EPCC parallel by 2x. As a
1766      workaround, we do not perform the copy if blocktime=infinite, since the
1767      values are not used by __kmp_wait_template() in that case. */
1768   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1769 #if KMP_USE_MONITOR
1770     this_thr->th.th_team_bt_intervals =
1771         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1772     this_thr->th.th_team_bt_set =
1773         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1774 #else
1775     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1776 #endif
1777   }
1778 
1779 #if USE_ITT_BUILD
1780   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1781     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782 #endif /* USE_ITT_BUILD */
1783 
1784   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1785   case bp_hyper_bar: {
1786     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1787     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1788                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789     break;
1790   }
1791   case bp_hierarchical_bar: {
1792     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1793                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794     break;
1795   }
1796   case bp_tree_bar: {
1797     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1798     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1799                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800     break;
1801   }
1802   default: {
1803     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1804                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1805   }
1806   }
1807 
1808   /* From this point on, the team data structure may be deallocated at any time
1809      by the master thread - it is unsafe to reference it in any of the worker
1810      threads. Any per-team data items that need to be referenced before the
1811      end of the barrier should be moved to the kmp_task_team_t structs.  */
1812   if (KMP_MASTER_TID(tid)) {
1813     if (__kmp_tasking_mode != tskm_immediate_exec) {
1814       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1815     }
1816     if (__kmp_display_affinity) {
1817       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1818     }
1819 #if KMP_STATS_ENABLED
1820     // Have master thread flag the workers to indicate they are now waiting for
1821     // the next parallel region. Also wake them up so they switch their timers
1822     // to idle.
1823     for (int i = 0; i < team->t.t_nproc; ++i) {
1824       kmp_info_t *team_thread = team->t.t_threads[i];
1825       if (team_thread == this_thr)
1826         continue;
1827       team_thread->th.th_stats->setIdleFlag();
1828       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1829           team_thread->th.th_sleep_loc != NULL)
1830         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1831                                   team_thread->th.th_sleep_loc);
1832     }
1833 #endif
1834 #if USE_ITT_BUILD
1835     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837 #endif /* USE_ITT_BUILD */
1838 
1839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1840     // Join barrier - report frame end
1841     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1842         __kmp_forkjoin_frames_mode &&
1843         (this_thr->th.th_teams_microtask == NULL || // either not in teams
1844          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1845         team->t.t_active_level == 1) {
1846       kmp_uint64 cur_time = __itt_get_timestamp();
1847       ident_t *loc = team->t.t_ident;
1848       kmp_info_t **other_threads = team->t.t_threads;
1849       int nproc = this_thr->th.th_team_nproc;
1850       int i;
1851       switch (__kmp_forkjoin_frames_mode) {
1852       case 1:
1853         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1854                                loc, nproc);
1855         break;
1856       case 2:
1857         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1858                                loc, nproc);
1859         break;
1860       case 3:
1861         if (__itt_metadata_add_ptr) {
1862           // Initialize with master's wait time
1863           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1864           // Set arrive time to zero to be able to check it in
1865           // __kmp_invoke_task(); the same is done inside the loop below
1866           this_thr->th.th_bar_arrive_time = 0;
1867           for (i = 1; i < nproc; ++i) {
1868             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1869             other_threads[i]->th.th_bar_arrive_time = 0;
1870           }
1871           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1872                                        cur_time, delta, 0);
1873         }
1874         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1875                                loc, nproc);
1876         this_thr->th.th_frame_time = cur_time;
1877         break;
1878       }
1879     }
1880 #endif /* USE_ITT_BUILD */
1881   }
1882 #if USE_ITT_BUILD
1883   else {
1884     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1885       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1886   }
1887 #endif /* USE_ITT_BUILD */
1888 
1889 #if KMP_DEBUG
1890   if (KMP_MASTER_TID(tid)) {
1891     KA_TRACE(
1892         15,
1893         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1894          gtid, team_id, tid, nproc));
1895   }
1896 #endif /* KMP_DEBUG */
1897 
1898   // TODO now, mark worker threads as done so they may be disbanded
1899   KMP_MB(); // Flush all pending memory write invalidates.
1900   KA_TRACE(10,
1901            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1902 
1903   ANNOTATE_BARRIER_END(&team->t.t_bar);
1904 }
1905 
1906 // TODO release worker threads' fork barriers as we are ready instead of all at
1907 // once
1908 void __kmp_fork_barrier(int gtid, int tid) {
1909   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1910   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1911   kmp_info_t *this_thr = __kmp_threads[gtid];
1912   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1913 #if USE_ITT_BUILD
1914   void *itt_sync_obj = NULL;
1915 #endif /* USE_ITT_BUILD */
1916   if (team)
1917     ANNOTATE_BARRIER_END(&team->t.t_bar);
1918 
1919   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1920                 (team != NULL) ? team->t.t_id : -1, tid));
1921 
1922   // th_team pointer only valid for master thread here
1923   if (KMP_MASTER_TID(tid)) {
1924 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1925     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1926       // Create itt barrier object
1927       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1928       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1929     }
1930 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1931 
1932 #ifdef KMP_DEBUG
1933     kmp_info_t **other_threads = team->t.t_threads;
1934     int i;
1935 
1936     // Verify state
1937     KMP_MB();
1938 
1939     for (i = 1; i < team->t.t_nproc; ++i) {
1940       KA_TRACE(500,
1941                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1942                 "== %u.\n",
1943                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1944                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1945                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1946       KMP_DEBUG_ASSERT(
1947           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1948            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1949       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1950     }
1951 #endif
1952 
1953     if (__kmp_tasking_mode != tskm_immediate_exec) {
1954       // 0 indicates setup current task team if nthreads > 1
1955       __kmp_task_team_setup(this_thr, team, 0);
1956     }
1957 
1958     /* The master thread may have changed its blocktime between the join barrier
1959        and the fork barrier. Copy the blocktime info to the thread, where
1960        __kmp_wait_template() can access it when the team struct is not
1961        guaranteed to exist. */
1962     // See note about the corresponding code in __kmp_join_barrier() being
1963     // performance-critical
1964     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1965 #if KMP_USE_MONITOR
1966       this_thr->th.th_team_bt_intervals =
1967           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1968       this_thr->th.th_team_bt_set =
1969           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1970 #else
1971       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1972 #endif
1973     }
1974   } // master
1975 
1976   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1977   case bp_hyper_bar: {
1978     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1979     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1980                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1981     break;
1982   }
1983   case bp_hierarchical_bar: {
1984     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1985                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1986     break;
1987   }
1988   case bp_tree_bar: {
1989     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1990     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1991                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1992     break;
1993   }
1994   default: {
1995     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1996                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1997   }
1998   }
1999 
2000 #if OMPT_SUPPORT
2001   if (ompt_enabled.enabled &&
2002       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2003     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2004     ompt_data_t *task_data = (team)
2005                                  ? OMPT_CUR_TASK_DATA(this_thr)
2006                                  : &(this_thr->th.ompt_thread_info.task_data);
2007     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2008 #if OMPT_OPTIONAL
2009     void *codeptr = NULL;
2010     if (KMP_MASTER_TID(ds_tid) &&
2011         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2012          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2013       codeptr = team->t.ompt_team_info.master_return_address;
2014     if (ompt_enabled.ompt_callback_sync_region_wait) {
2015       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2016           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2017           codeptr);
2018     }
2019     if (ompt_enabled.ompt_callback_sync_region) {
2020       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2021           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2022           codeptr);
2023     }
2024 #endif
2025     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2026       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2027           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2028     }
2029   }
2030 #endif
2031 
2032   // Early exit for reaping threads releasing forkjoin barrier
2033   if (TCR_4(__kmp_global.g.g_done)) {
2034     this_thr->th.th_task_team = NULL;
2035 
2036 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2037     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2038       if (!KMP_MASTER_TID(tid)) {
2039         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2040         if (itt_sync_obj)
2041           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2042       }
2043     }
2044 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2045     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2046     return;
2047   }
2048 
2049   /* We can now assume that a valid team structure has been allocated by the
2050      master and propagated to all worker threads. The current thread, however,
2051      may not be part of the team, so we can't blindly assume that the team
2052      pointer is non-null.  */
2053   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2054   KMP_DEBUG_ASSERT(team != NULL);
2055   tid = __kmp_tid_from_gtid(gtid);
2056 
2057 #if KMP_BARRIER_ICV_PULL
2058   /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2059      __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2060      implicit task has this data before this function is called. We cannot
2061      modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2062      struct, because it is not always the case that the threads arrays have
2063      been allocated when __kmp_fork_call() is executed. */
2064   {
2065     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2066     if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2067       // Copy the initial ICVs from the master's thread struct to the implicit
2068       // task for this tid.
2069       KA_TRACE(10,
2070                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2071       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2072                                tid, FALSE);
2073       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2074                 &team->t.t_threads[0]
2075                      ->th.th_bar[bs_forkjoin_barrier]
2076                      .bb.th_fixed_icvs);
2077     }
2078   }
2079 #endif // KMP_BARRIER_ICV_PULL
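  // Under KMP_BARRIER_ICV_PULL the master publishes the ICVs once (see
  // __kmp_setup_icv_copy below) and each worker copies them here after being
  // released; under KMP_BARRIER_ICV_PUSH they are instead propagated during
  // the fork-barrier release itself.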
2080 
2081   if (__kmp_tasking_mode != tskm_immediate_exec) {
2082     __kmp_task_team_sync(this_thr, team);
2083   }
2084 
2085 #if KMP_AFFINITY_SUPPORTED
2086   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2087   if (proc_bind == proc_bind_intel) {
2088     // Call dynamic affinity settings
2089     if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2090       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2091     }
2092   } else if (proc_bind != proc_bind_false) {
2093     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2094       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2095                      __kmp_gtid_from_thread(this_thr),
2096                      this_thr->th.th_current_place));
2097     } else {
2098       __kmp_affinity_set_place(gtid);
2099     }
2100   }
2101 #endif // KMP_AFFINITY_SUPPORTED
2102   // Perform the display affinity functionality
2103   if (__kmp_display_affinity) {
2104     if (team->t.t_display_affinity
2105 #if KMP_AFFINITY_SUPPORTED
2106         || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2107 #endif
2108             ) {
2109       // NULL means use the affinity-format-var ICV
2110       __kmp_aux_display_affinity(gtid, NULL);
2111       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2112       this_thr->th.th_prev_level = team->t.t_level;
2113     }
2114   }
2115   if (!KMP_MASTER_TID(tid))
2116     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2117 
2118 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2119   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2120     if (!KMP_MASTER_TID(tid)) {
2121       // Get correct barrier object
2122       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2123       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2124     } // (prepare called inside barrier_release)
2125   }
2126 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2127   ANNOTATE_BARRIER_END(&team->t.t_bar);
2128   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2129                 team->t.t_id, tid));
2130 }
2131 
2132 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2133                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2134   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2135 
2136   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2137   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2138 
2139 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2140    __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2141    implicit task has this data before this function is called. */
2142 #if KMP_BARRIER_ICV_PULL
2143   /* Copy the ICVs to th_fixed_icvs in the master's thread structure (which
2144      remains untouched), where all of the worker threads can access them and
2145      make their own copies after the barrier. */
2146   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2147   // allocated at this point
2148   copy_icvs(
2149       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2150       new_icvs);
2151   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2152                 team->t.t_threads[0], team));
2153 #elif KMP_BARRIER_ICV_PUSH
2154   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2155   // done here.
2156   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2157                 team->t.t_threads[0], team));
2158 #else
2159   // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2160   // time.
2161   ngo_load(new_icvs);
2162   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2163   // allocated at this point
2164   for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2165     // TODO: GEH - pass in better source location info since usually NULL here
2166     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2167                   f, team->t.t_threads[f], team));
2168     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2169     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2170     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2171                   f, team->t.t_threads[f], team));
2172   }
2173   ngo_sync();
2174 #endif // KMP_BARRIER_ICV_PULL
2175 }
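// In short, the ICVs for a new team reach the workers in one of three ways:
// with KMP_BARRIER_ICV_PULL the master publishes them once in its
// th_fixed_icvs and workers copy them in __kmp_fork_barrier(); with
// KMP_BARRIER_ICV_PUSH they ride along with the fork-barrier release; and
// otherwise they are copied linearly into every worker's implicit task here.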
2176