1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
// Windows does not need these include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
105 /* fast (and somewhat portable) way to get unique identifier of executing
106    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
120   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
122      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
123      __kmp_init_gtid for this to work. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
143   /* ATT: The code below is a source of potential bugs due to unsynchronized
144      access to __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
177   /* get specific to try and determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
  /* if we haven't been assigned a gtid, then return code */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
194 
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
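/* Note on the lookup above: every registered thread records its stack base and
   size in th.th_info.ds, and stacks grow down, so an address A belongs to
   thread i exactly when
       ds_stackbase - ds_stacksize <= A <= ds_stackbase.
   A minimal illustrative sketch of that containment test (not part of the
   runtime; the helper name is hypothetical):

     static bool kmp_addr_on_stack(char *addr, char *base, size_t size) {
       return addr <= base && (size_t)(base - addr) <= size;
     }

   Uber (root) threads additionally have their recorded window widened at the
   end of this function, so later calls can succeed without consulting TLS. */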
216 
int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
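/* If none of the lookups above produced a gtid, the caller is a root thread
   the library has not seen yet: under __kmp_initz_lock it either runs
   __kmp_do_serial_initialize() and re-reads the gtid from TLS, or registers
   itself directly via __kmp_register_root(FALSE). */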
258 
259 /* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
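/* The check above treats each stack as the interval
   [ds_stackbase - ds_stacksize, ds_stackbase] and raises the fatal
   StackOverlap error when either end of the current thread's interval falls
   strictly inside another live thread's interval. Uber-master threads are
   skipped because their recorded bounds are refined dynamically in
   __kmp_get_global_thread_id(). */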
323 
324 /* ------------------------------------------------------------------------ */
325 
void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(TRUE);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
376               (char *)p1 += page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
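/* Usage sketch (mirrors the calls later in this file): the caller's format and
   varargs describe only the object being mapped, while the address range and
   size are prepended by the wrapper format above, e.g.

     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);

   prints a line of the form "OMP storage map: <p1> <p2> <size> th_<gtid>". */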
399 
void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
431     /* On Windows* OS by default abort() causes pop-up error box, which stalls
432        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433        boxes. _set_abort_behavior() works well, but this function is not
434        available in VS7 (this is not problem for DLL, but it is a problem for
435        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
436        help, at least in some versions of MS C RTL.
437 
438        It seems following sequence is the only way to simulate abort() and
439        avoid pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     __kmp_unregister_library();
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
void __kmp_abort_thread(void) {
453   // TODO: Eliminate g_abort global variable and this function.
454   // In case of abort just call abort(), it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
static void __kmp_init_allocator() { __kmp_init_memkind(); }
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
555   // PROCESS_DETACH is expected to be called by a thread that executes
556   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
557   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
558   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
  // threads can still be alive here, although they are about to be terminated.
  // The threads in the array with ds_thread==0 are the most suspicious, so it
  // may not be safe to access __kmp_threads[] at all.
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
565   // Let's check that there are no other alive threads registered with the OMP
566   // lib.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive. So they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear triggering the problem of unreleased forkjoin
618       // lock as described below.
619 
620       // A worker thread can take the forkjoin lock. The problem comes up if
621       // that worker thread becomes dead before it releases the forkjoin lock.
622       // The forkjoin lock remains taken, while the thread executing
623       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624       // to take the forkjoin lock and will always fail, so that the application
625       // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not just a corner case;
      // it covers several common situations:
628       // - the main function was compiled by an alternative compiler;
629       // - the main function was compiled by icl but without /Qopenmp
630       //   (application with plugins);
631       // - application terminates by calling C exit(), Fortran CALL EXIT() or
632       //   Fortran STOP.
633       // - alive foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647     /* if we want to register new siblings all the time here call
648      * __kmp_get_gtid(); */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666   int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668   kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671   if (__kmp_env_consistency_check) {
672     if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678   }
679 #ifdef BUILD_PARALLEL_ORDERED
680   if (!team->t.t_serialized) {
681     KMP_MB();
682     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683              NULL);
684     KMP_MB();
685   }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691   int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693   int tid = __kmp_tid_from_gtid(gtid);
694   kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697   if (__kmp_env_consistency_check) {
698     if (__kmp_threads[gtid]->th.th_root->r.r_active)
699       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700   }
701 #ifdef BUILD_PARALLEL_ORDERED
702   if (!team->t.t_serialized) {
703     KMP_MB(); /* Flush all pending memory write invalidates.  */
704 
705     /* use the tid of the next thread in this team */
706     /* TODO replace with general release procedure */
707     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709     KMP_MB(); /* Flush all pending memory write invalidates.  */
710   }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
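/* Taken together, __kmp_parallel_deo/__kmp_parallel_dxo implement the ordered
   handshake when BUILD_PARALLEL_ORDERED is defined: deo spins until
   team->t.t_ordered.dt.t_value equals the calling thread's tid, and dxo then
   passes the turn along by storing (tid + 1) % t_nproc, so ordered regions
   are entered in thread-id order within the team. */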
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit   */
716 
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718   int status;
719   kmp_info_t *th;
720   kmp_team_t *team;
721 
722   if (!TCR_4(__kmp_init_parallel))
723     __kmp_parallel_initialize();
724   __kmp_resume_if_soft_paused();
725 
726   th = __kmp_threads[gtid];
727   team = th->th.th_team;
728   status = 0;
729 
730   th->th.th_ident = id_ref;
731 
732   if (team->t.t_serialized) {
733     status = 1;
734   } else {
735     kmp_int32 old_this = th->th.th_local.this_construct;
736 
737     ++th->th.th_local.this_construct;
738     /* try to set team count to thread count--success means thread got the
739        single block */
740     /* TODO: Should this be acquire or release? */
741     if (team->t.t_construct == old_this) {
742       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743                                               th->th.th_local.this_construct);
744     }
745 #if USE_ITT_BUILD
746     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748         team->t.t_active_level ==
749             1) { // Only report metadata by master of active team at level 1
750       __kmp_itt_metadata_single(id_ref);
751     }
752 #endif /* USE_ITT_BUILD */
753   }
754 
755   if (__kmp_env_consistency_check) {
756     if (status && push_ws) {
757       __kmp_push_workshare(gtid, ct_psingle, id_ref);
758     } else {
759       __kmp_check_workshare(gtid, ct_psingle, id_ref);
760     }
761   }
762 #if USE_ITT_BUILD
763   if (status) {
764     __kmp_itt_single_start(gtid);
765   }
766 #endif /* USE_ITT_BUILD */
767   return status;
768 }
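/* Winner selection above: t_construct acts as a per-team construct counter.
   Every thread advances its private this_construct, and the one whose
   compare-and-store moves t_construct from the old value to the new one owns
   the single block (status == 1); everyone else sees status == 0 and skips
   the block, relying on the explicit barrier that follows a SINGLE. */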
769 
void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772   __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774   if (__kmp_env_consistency_check)
775     __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777 
778 /* determine if we can go parallel or must use a serialized parallel region and
779  * how many threads we can use
780  * set_nproc is the number of threads requested for the team
781  * returns 0 if we should serialize or only use one thread,
782  * otherwise the number of threads to use
783  * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785                                  int master_tid, int set_nthreads,
786                                  int enter_teams) {
787   int capacity;
788   int new_nthreads;
789   KMP_DEBUG_ASSERT(__kmp_init_serial);
790   KMP_DEBUG_ASSERT(root && parent_team);
791   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792 
793   // If dyn-var is set, dynamically adjust the number of desired threads,
794   // according to the method specified by dynamic_mode.
795   new_nthreads = set_nthreads;
796   if (!get__dynamic_2(parent_team, master_tid)) {
797     ;
798   }
799 #ifdef USE_LOAD_BALANCE
800   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802     if (new_nthreads == 1) {
803       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804                     "reservation to 1 thread\n",
805                     master_tid));
806       return 1;
807     }
808     if (new_nthreads < set_nthreads) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810                     "reservation to %d threads\n",
811                     master_tid, new_nthreads));
812     }
813   }
814 #endif /* USE_LOAD_BALANCE */
815   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816     new_nthreads = __kmp_avail_proc - __kmp_nth +
817                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818     if (new_nthreads <= 1) {
819       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820                     "reservation to 1 thread\n",
821                     master_tid));
822       return 1;
823     }
824     if (new_nthreads < set_nthreads) {
825       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826                     "reservation to %d threads\n",
827                     master_tid, new_nthreads));
828     } else {
829       new_nthreads = set_nthreads;
830     }
831   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832     if (set_nthreads > 2) {
833       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834       new_nthreads = (new_nthreads % set_nthreads) + 1;
835       if (new_nthreads == 1) {
836         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837                       "reservation to 1 thread\n",
838                       master_tid));
839         return 1;
840       }
841       if (new_nthreads < set_nthreads) {
842         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843                       "reservation to %d threads\n",
844                       master_tid, new_nthreads));
845       }
846     }
847   } else {
848     KMP_ASSERT(0);
849   }
850 
851   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852   if (__kmp_nth + new_nthreads -
853           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854       __kmp_max_nth) {
855     int tl_nthreads = __kmp_max_nth - __kmp_nth +
856                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (tl_nthreads <= 0) {
858       tl_nthreads = 1;
859     }
860 
861     // If dyn-var is false, emit a 1-time warning.
862     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863       __kmp_reserve_warn = 1;
864       __kmp_msg(kmp_ms_warning,
865                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867     }
868     if (tl_nthreads == 1) {
869       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870                     "reduced reservation to 1 thread\n",
871                     master_tid));
872       return 1;
873     }
874     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875                   "reservation to %d threads\n",
876                   master_tid, tl_nthreads));
877     new_nthreads = tl_nthreads;
878   }
879 
880   // Respect OMP_THREAD_LIMIT
881   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883   if (cg_nthreads + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       max_cg_threads) {
886     int tl_nthreads = max_cg_threads - cg_nthreads +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Check if the threads array is large enough, or needs expanding.
912   // See comment in __kmp_register_root() about the adjustment if
913   // __kmp_threads[0] == NULL.
914   capacity = __kmp_threads_capacity;
915   if (TCR_PTR(__kmp_threads[0]) == NULL) {
916     --capacity;
917   }
918   if (__kmp_nth + new_nthreads -
919           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920       capacity) {
921     // Expand the threads array.
922     int slotsRequired = __kmp_nth + new_nthreads -
923                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924                         capacity;
925     int slotsAdded = __kmp_expand_threads(slotsRequired);
926     if (slotsAdded < slotsRequired) {
927       // The threads array was not expanded enough.
928       new_nthreads -= (slotsRequired - slotsAdded);
929       KMP_ASSERT(new_nthreads >= 1);
930 
931       // If dyn-var is false, emit a 1-time warning.
932       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933         __kmp_reserve_warn = 1;
934         if (__kmp_tp_cached) {
935           __kmp_msg(kmp_ms_warning,
936                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939         } else {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943         }
944       }
945     }
946   }
947 
948 #ifdef KMP_DEBUG
949   if (new_nthreads == 1) {
950     KC_TRACE(10,
951              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952               "dead roots and rechecking; requested %d threads\n",
953               __kmp_get_gtid(), set_nthreads));
954   } else {
955     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956                   " %d threads\n",
957                   __kmp_get_gtid(), new_nthreads, set_nthreads));
958   }
959 #endif // KMP_DEBUG
960   return new_nthreads;
961 }
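/* Summary of the clamps applied above, in order: (1) if dyn-var is set, the
   request is shrunk according to __kmp_global.g.g_dynamic_mode (load balance,
   thread limit, or random); (2) it is capped by KMP_DEVICE_THREAD_LIMIT /
   KMP_ALL_THREADS via __kmp_max_nth; (3) it is capped by the contention
   group's OMP_THREAD_LIMIT (cg_thread_limit); (4) it is reduced further if
   __kmp_expand_threads() could not grow the threads array enough. A return
   value of 1 tells the caller to serialize or run with a single thread. */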
962 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   earlier while holding the forkjoin lock. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967                                     kmp_info_t *master_th, int master_gtid) {
968   int i;
969   int use_hot_team;
970 
971   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973   KMP_MB();
974 
975   /* first, let's setup the master thread */
976   master_th->th.th_info.ds.ds_tid = 0;
977   master_th->th.th_team = team;
978   master_th->th.th_team_nproc = team->t.t_nproc;
979   master_th->th.th_team_master = master_th;
980   master_th->th.th_team_serialized = FALSE;
981   master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
984 #if KMP_NESTED_HOT_TEAMS
985   use_hot_team = 0;
986   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987   if (hot_teams) { // hot teams array is not allocated if
988     // KMP_HOT_TEAMS_MAX_LEVEL=0
989     int level = team->t.t_active_level - 1; // index in array of hot teams
990     if (master_th->th.th_teams_microtask) { // are we inside the teams?
991       if (master_th->th.th_teams_size.nteams > 1) {
992         ++level; // level was not increased in teams construct for
993         // team_of_masters
994       }
995       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996           master_th->th.th_teams_level == team->t.t_level) {
997         ++level; // level was not increased in teams construct for
998         // team_of_workers before the parallel
999       } // team->t.t_level will be increased inside parallel
1000     }
1001     if (level < __kmp_hot_teams_max_level) {
1002       if (hot_teams[level].hot_team) {
1003         // hot team has already been allocated for given level
1004         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005         use_hot_team = 1; // the team is ready to use
1006       } else {
1007         use_hot_team = 0; // AC: threads are not allocated yet
1008         hot_teams[level].hot_team = team; // remember new hot team
1009         hot_teams[level].hot_team_nth = team->t.t_nproc;
1010       }
1011     } else {
1012       use_hot_team = 0;
1013     }
1014   }
1015 #else
1016   use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018   if (!use_hot_team) {
1019 
1020     /* install the master thread */
1021     team->t.t_threads[0] = master_th;
1022     __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024     /* now, install the worker threads */
1025     for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027       /* fork or reallocate a new thread and install it in team */
1028       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029       team->t.t_threads[i] = thr;
1030       KMP_DEBUG_ASSERT(thr);
1031       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032       /* align team and thread arrived states */
1033       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038                     team->t.t_bar[bs_plain_barrier].b_arrived));
1039       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040       thr->th.th_teams_level = master_th->th.th_teams_level;
1041       thr->th.th_teams_size = master_th->th.th_teams_size;
1042       { // Initialize threads' barrier data.
1043         int b;
1044         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045         for (b = 0; b < bs_last_barrier; ++b) {
1046           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051         }
1052       }
1053     }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056     __kmp_partition_places(team);
1057 #endif
1058   }
1059 
1060   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061     for (i = 0; i < team->t.t_nproc; i++) {
1062       kmp_info_t *thr = team->t.t_threads[i];
1063       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064           thr->th.th_prev_level != team->t.t_level) {
1065         team->t.t_display_affinity = 1;
1066         break;
1067       }
1068     }
1069   }
1070 
1071   KMP_MB();
1072 }
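/* When a hot team is being reused, all of the per-thread setup above is
   skipped. Otherwise the master is installed in slot 0, each worker obtained
   from __kmp_allocate_thread() gets its barrier b_arrived counters aligned
   with the team's so the first fork/join barrier pairs up correctly, the
   teams-construct fields are copied from the master, and places are
   partitioned when affinity is supported. */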
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1075 // Propagate any changes to the floating point control registers out to the team
1076 // We try to avoid unnecessary writes to the relevant cache line in the team
1077 // structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
1079   if (__kmp_inherit_fp_control) {
1080     kmp_int16 x87_fpu_control_word;
1081     kmp_uint32 mxcsr;
1082 
1083     // Get master values of FPU control flags (both X87 and vector)
1084     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085     __kmp_store_mxcsr(&mxcsr);
1086     mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088     // There is no point looking at t_fp_control_saved here.
1089     // If it is TRUE, we still have to update the values if they are different
1090     // from those we now have. If it is FALSE we didn't save anything yet, but
1091     // our objective is the same. We have to ensure that the values in the team
1092     // are the same as those we have.
1093     // So, this code achieves what we need whether or not t_fp_control_saved is
1094     // true. By checking whether the value needs updating we avoid unnecessary
1095     // writes that would put the cache-line into a written state, causing all
1096     // threads in the team to have to read it again.
1097     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099     // Although we don't use this value, other code in the runtime wants to know
1100     // whether it should restore them. So we must ensure it is correct.
1101     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102   } else {
1103     // Similarly here. Don't write to this cache-line in the team structure
1104     // unless we have to.
1105     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106   }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
inline static void updateHWFPControl(kmp_team_t *team) {
1112   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1115     kmp_int16 x87_fpu_control_word;
1116     kmp_uint32 mxcsr;
1117     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118     __kmp_store_mxcsr(&mxcsr);
1119     mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122       __kmp_clear_x87_fpu_status_word();
1123       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124     }
1125 
1126     if (team->t.t_mxcsr != mxcsr) {
1127       __kmp_load_mxcsr(&team->t.t_mxcsr);
1128     }
1129   }
1130 }
1131 #else
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
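/* Intended pairing of the two helpers above: propagateFPControl(team) captures
   the master's x87 control word and MXCSR into the team (using
   KMP_CHECK_UPDATE to avoid dirtying the cache line needlessly), and
   updateHWFPControl(team) later writes the team's saved values back into the
   hardware registers only when they differ from the current state. On non-x86
   targets both compile away to no-ops. */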
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137                                      int realloc); // forward declaration
1138 
1139 /* Run a parallel region that has been serialized, so runs only in a team of the
1140    single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142   kmp_info_t *this_thr;
1143   kmp_team_t *serial_team;
1144 
1145   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147   /* Skip all this code for autopar serialized loops since it results in
1148      unacceptable overhead */
1149   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150     return;
1151 
1152   if (!TCR_4(__kmp_init_parallel))
1153     __kmp_parallel_initialize();
1154   __kmp_resume_if_soft_paused();
1155 
1156   this_thr = __kmp_threads[global_tid];
1157   serial_team = this_thr->th.th_serial_team;
1158 
1159   /* utilize the serialized team held by this thread */
1160   KMP_DEBUG_ASSERT(serial_team);
1161   KMP_MB();
1162 
1163   if (__kmp_tasking_mode != tskm_immediate_exec) {
1164     KMP_DEBUG_ASSERT(
1165         this_thr->th.th_task_team ==
1166         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168                      NULL);
1169     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170                   "team %p, new task_team = NULL\n",
1171                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172     this_thr->th.th_task_team = NULL;
1173   }
1174 
1175   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177     proc_bind = proc_bind_false;
1178   } else if (proc_bind == proc_bind_default) {
1179     // No proc_bind clause was specified, so use the current value
1180     // of proc-bind-var for this parallel region.
1181     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182   }
1183   // Reset for next parallel region
1184   this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187   ompt_data_t ompt_parallel_data = ompt_data_none;
1188   ompt_data_t *implicit_task_data;
1189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190   if (ompt_enabled.enabled &&
1191       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193     ompt_task_info_t *parent_task_info;
1194     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197     if (ompt_enabled.ompt_callback_parallel_begin) {
1198       int team_size = 1;
1199 
1200       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201           &(parent_task_info->task_data), &(parent_task_info->frame),
1202           &ompt_parallel_data, team_size,
1203           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204     }
1205   }
1206 #endif // OMPT_SUPPORT
1207 
1208   if (this_thr->th.th_team != serial_team) {
1209     // Nested level will be an index in the nested nthreads array
1210     int level = this_thr->th.th_team->t.t_level;
1211 
1212     if (serial_team->t.t_serialized) {
1213       /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1215       kmp_team_t *new_team;
1216 
1217       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219       new_team =
1220           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222                               ompt_parallel_data,
1223 #endif
1224                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1225                               0 USE_NESTED_HOT_ARG(NULL));
1226       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227       KMP_ASSERT(new_team);
1228 
1229       /* setup new serialized team and install it */
1230       new_team->t.t_threads[0] = this_thr;
1231       new_team->t.t_parent = this_thr->th.th_team;
1232       serial_team = new_team;
1233       this_thr->th.th_serial_team = serial_team;
1234 
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238            global_tid, serial_team));
1239 
1240       /* TODO the above breaks the requirement that if we run out of resources,
1241          then we can still guarantee that serialized teams are ok, since we may
1242          need to allocate a new one */
1243     } else {
1244       KF_TRACE(
1245           10,
1246           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247            global_tid, serial_team));
1248     }
1249 
1250     /* we have to initialize this serial team */
1251     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254     serial_team->t.t_ident = loc;
1255     serial_team->t.t_serialized = 1;
1256     serial_team->t.t_nproc = 1;
1257     serial_team->t.t_parent = this_thr->th.th_team;
1258     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259     this_thr->th.th_team = serial_team;
1260     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263                   this_thr->th.th_current_task));
1264     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265     this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270        implicit task for each serialized task represented by
1271        team->t.t_serialized? */
1272     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273               &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281 
1282     if (__kmp_nested_proc_bind.used &&
1283         (level + 1 < __kmp_nested_proc_bind.used)) {
1284       this_thr->th.th_current_task->td_icvs.proc_bind =
1285           __kmp_nested_proc_bind.bind_types[level + 1];
1286     }
1287 
1288 #if USE_DEBUGGER
1289     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291     this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293     /* set thread cache values */
1294     this_thr->th.th_team_nproc = 1;
1295     this_thr->th.th_team_master = this_thr;
1296     this_thr->th.th_team_serialized = 1;
1297 
1298     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302     propagateFPControl(serial_team);
1303 
1304     /* check if we need to allocate dispatch buffers stack */
1305     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307       serial_team->t.t_dispatch->th_disp_buffer =
1308           (dispatch_private_info_t *)__kmp_allocate(
1309               sizeof(dispatch_private_info_t));
1310     }
1311     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313     KMP_MB();
1314 
1315   } else {
1316     /* this serialized team is already being used,
1317      * that's fine, just add another nested level */
1318     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321     ++serial_team->t.t_serialized;
1322     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324     // Nested level will be an index in the nested nthreads array
1325     int level = this_thr->th.th_team->t.t_level;
1326     // Thread value exists in the nested nthreads array for the next nested
1327     // level
1328     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329       this_thr->th.th_current_task->td_icvs.nproc =
1330           __kmp_nested_nth.nth[level + 1];
1331     }
1332     serial_team->t.t_level++;
1333     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334                   "of serial team %p to %d\n",
1335                   global_tid, serial_team, serial_team->t.t_level));
1336 
1337     /* allocate/push dispatch buffers stack */
1338     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339     {
1340       dispatch_private_info_t *disp_buffer =
1341           (dispatch_private_info_t *)__kmp_allocate(
1342               sizeof(dispatch_private_info_t));
1343       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345     }
1346     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348     KMP_MB();
1349   }
1350   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352   // Perform the display affinity functionality for
1353   // serialized parallel regions
1354   if (__kmp_display_affinity) {
1355     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356         this_thr->th.th_prev_num_threads != 1) {
1357       // NULL means use the affinity-format-var ICV
1358       __kmp_aux_display_affinity(global_tid, NULL);
1359       this_thr->th.th_prev_level = serial_team->t.t_level;
1360       this_thr->th.th_prev_num_threads = 1;
1361     }
1362   }
1363 
1364   if (__kmp_env_consistency_check)
1365     __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367   serial_team->t.ompt_team_info.master_return_address = codeptr;
1368   if (ompt_enabled.enabled &&
1369       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371 
1372     ompt_lw_taskteam_t lw_taskteam;
1373     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374                             &ompt_parallel_data, codeptr);
1375 
1376     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1377     // don't use lw_taskteam after linking. content was swapped
1378 
1379     /* OMPT implicit task begin */
1380     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381     if (ompt_enabled.ompt_callback_implicit_task) {
1382       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385       OMPT_CUR_TASK_INFO(this_thr)
1386           ->thread_num = __kmp_tid_from_gtid(global_tid);
1387     }
1388 
1389     /* OMPT state */
1390     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392   }
1393 #endif
1394 }
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399                     enum fork_context_e call_context, // Intel, GNU, ...
1400                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401                     kmp_va_list ap) {
1402   void **argv;
1403   int i;
1404   int master_tid;
1405   int master_this_cons;
1406   kmp_team_t *team;
1407   kmp_team_t *parent_team;
1408   kmp_info_t *master_th;
1409   kmp_root_t *root;
1410   int nthreads;
1411   int master_active;
1412   int master_set_numthreads;
1413   int level;
1414   int active_level;
1415   int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417   kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419   { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1425       /* Some systems prefer the stack for the root thread(s) to start with */
1426       /* some gap from the parent stack to prevent false sharing. */
1427       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428       /* These 2 lines below are so this does not get optimized out */
1429       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430         __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT(
1435         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436     if (!TCR_4(__kmp_init_parallel))
1437       __kmp_parallel_initialize();
1438     __kmp_resume_if_soft_paused();
1439 
1440     /* setup current data */
1441     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442     // shutdown
1443     parent_team = master_th->th.th_team;
1444     master_tid = master_th->th.th_info.ds.ds_tid;
1445     master_this_cons = master_th->th.th_local.this_construct;
1446     root = master_th->th.th_root;
1447     master_active = root->r.r_active;
1448     master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451     ompt_data_t ompt_parallel_data = ompt_data_none;
1452     ompt_data_t *parent_task_data;
1453     ompt_frame_t *ompt_frame;
1454     ompt_data_t *implicit_task_data;
1455     void *return_address = NULL;
1456 
1457     if (ompt_enabled.enabled) {
1458       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459                                     NULL, NULL);
1460       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461     }
1462 #endif
1463 
1464     // Nested level will be an index in the nested nthreads array
1465     level = parent_team->t.t_level;
1466     // active_level is used to launch non-serialized teams even if nesting is not allowed
1467     active_level = parent_team->t.t_active_level;
1468     // needed to check nesting inside the teams
1469     teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471     p_hot_teams = &master_th->th.th_hot_teams;
1472     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1476       // it is either actual or not needed (when active_level > 0)
1477       (*p_hot_teams)[0].hot_team_nth = 1;
1478     }
1479 #endif
1480 
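    // Notify OMPT tools that a parallel region is starting. The reported team
    // size is the num_threads request if one was made, otherwise nproc-var, and
    // the flags distinguish a league (teams construct) from an ordinary team.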
1481 #if OMPT_SUPPORT
1482     if (ompt_enabled.enabled) {
1483       if (ompt_enabled.ompt_callback_parallel_begin) {
1484         int team_size = master_set_numthreads
1485                             ? master_set_numthreads
1486                             : get__nproc_2(parent_team, master_tid);
1487         int flags = OMPT_INVOKER(call_context) |
1488                     ((microtask == (microtask_t)__kmp_teams_master)
1489                          ? ompt_parallel_league
1490                          : ompt_parallel_team);
1491         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493             return_address);
1494       }
1495       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496     }
1497 #endif
1498 
1499     master_th->th.th_ident = loc;
1500 
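    // Parallel region nested directly inside a teams construct (not the teams
    // master fork itself): reuse the hot parent team instead of allocating a
    // new one.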
1501     if (master_th->th.th_teams_microtask && ap &&
1502         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503       // AC: This is the start of a parallel region nested inside a teams
1504       // construct. The team is actual (hot); all workers are ready at the fork
1505       // barrier. No lock is needed to set up the team briefly, then release the workers.
1506       parent_team->t.t_ident = loc;
1507       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508       parent_team->t.t_argc = argc;
1509       argv = (void **)parent_team->t.t_argv;
1510       for (i = argc - 1; i >= 0; --i)
1511         *argv++ = va_arg(kmp_va_deref(ap), void *);
1512       // Increment our nested depth levels, but do not increase the serialization count
1513       if (parent_team == master_th->th.th_serial_team) {
1514         // AC: we are in serialized parallel
1515         __kmpc_serialized_parallel(loc, gtid);
1516         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518         if (call_context == fork_context_gnu) {
1519           // AC: need to decrement t_serialized for enquiry functions to work
1520           // correctly, will restore at join time
1521           parent_team->t.t_serialized--;
1522           return TRUE;
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         void *dummy;
1527         void **exit_frame_p;
1528 
1529         ompt_lw_taskteam_t lw_taskteam;
1530 
1531         if (ompt_enabled.enabled) {
1532           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533                                   &ompt_parallel_data, return_address);
1534           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1537           // don't use lw_taskteam after linking. content was swapped
1538 
1539           /* OMPT implicit task begin */
1540           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541           if (ompt_enabled.ompt_callback_implicit_task) {
1542             OMPT_CUR_TASK_INFO(master_th)
1543                 ->thread_num = __kmp_tid_from_gtid(gtid);
1544             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546                 implicit_task_data, 1,
1547                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548           }
1549 
1550           /* OMPT state */
1551           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552         } else {
1553           exit_frame_p = &dummy;
1554         }
1555 #endif
1556         // AC: need to decrement t_serialized for enquiry functions to work
1557         // correctly, will restore at join time
1558         parent_team->t.t_serialized--;
1559 
1560         {
1561           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565                                  ,
1566                                  exit_frame_p
1567 #endif
1568                                  );
1569         }
1570 
1571 #if OMPT_SUPPORT
1572         if (ompt_enabled.enabled) {
1573           *exit_frame_p = NULL;
1574           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575           if (ompt_enabled.ompt_callback_implicit_task) {
1576             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577                 ompt_scope_end, NULL, implicit_task_data, 1,
1578                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579           }
1580           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581           __ompt_lw_taskteam_unlink(master_th);
1582           if (ompt_enabled.ompt_callback_parallel_end) {
1583             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586                 return_address);
1587           }
1588           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589         }
1590 #endif
1591         return TRUE;
1592       }
1593 
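      // Not serialized: run the nested parallel on the existing hot parent team.
      // Record the outlined function and invoker, and bump the nesting counters.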
1594       parent_team->t.t_pkfn = microtask;
1595       parent_team->t.t_invoke = invoker;
1596       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597       parent_team->t.t_active_level++;
1598       parent_team->t.t_level++;
1599       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602       if (ompt_enabled.enabled) {
1603         ompt_lw_taskteam_t lw_taskteam;
1604         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605                                 &ompt_parallel_data, return_address);
1606         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607       }
1608 #endif
1609 
1610       /* Change number of threads in the team if requested */
1611       if (master_set_numthreads) { // The parallel has num_threads clause
1612         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1613           // AC: can only reduce the number of threads dynamically; cannot increase it
1614           kmp_info_t **other_threads = parent_team->t.t_threads;
1615           parent_team->t.t_nproc = master_set_numthreads;
1616           for (i = 0; i < master_set_numthreads; ++i) {
1617             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618           }
1619           // Keep extra threads hot in the team for possible next parallels
1620         }
1621         master_th->th.th_set_nproc = 0;
1622       }
1623 
1624 #if USE_DEBUGGER
1625       if (__kmp_debugging) { // Let debugger override number of threads.
1626         int nth = __kmp_omp_num_threads(loc);
1627         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628           master_set_numthreads = nth;
1629         }
1630       }
1631 #endif
1632 
1633 #if USE_ITT_BUILD
1634       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635            KMP_ITT_DEBUG) &&
1636           __kmp_forkjoin_frames_mode == 3 &&
1637           parent_team->t.t_active_level == 1 // only report frames at level 1
1638           && master_th->th.th_teams_size.nteams == 1) {
1639         kmp_uint64 tmp_time = __itt_get_timestamp();
1640         master_th->th.th_frame_time = tmp_time;
1641         parent_team->t.t_region_time = tmp_time;
1642       }
1643       if (__itt_stack_caller_create_ptr) {
1644         // create new stack stitching id before entering fork barrier
1645         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646       }
1647 #endif /* USE_ITT_BUILD */
1648 
1649       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650                     "master_th=%p, gtid=%d\n",
1651                     root, parent_team, master_th, gtid));
1652       __kmp_internal_fork(loc, gtid, parent_team);
1653       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654                     "master_th=%p, gtid=%d\n",
1655                     root, parent_team, master_th, gtid));
1656 
1657       if (call_context == fork_context_gnu)
1658         return TRUE;
1659 
1660       /* Invoke microtask for MASTER thread */
1661       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662                     parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664       if (!parent_team->t.t_invoke(gtid)) {
1665         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666       }
1667       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668                     parent_team->t.t_id, parent_team->t.t_pkfn));
1669       KMP_MB(); /* Flush all pending memory write invalidates.  */
1670 
1671       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673       return TRUE;
1674     } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677     if (__kmp_tasking_mode != tskm_immediate_exec) {
1678       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1680     }
1681 #endif
1682 
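    // Decide how many threads this region gets: serialize (nthreads == 1) if we
    // are already at max-active-levels, otherwise honor a num_threads clause or
    // nproc-var and then reserve threads under the forkjoin lock.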
1683     if (parent_team->t.t_active_level >=
1684         master_th->th.th_current_task->td_icvs.max_active_levels) {
1685       nthreads = 1;
1686     } else {
1687       int enter_teams = ((ap == NULL && active_level == 0) ||
1688                          (ap && teams_level > 0 && teams_level == level));
1689       nthreads =
1690           master_set_numthreads
1691               ? master_set_numthreads
1692               : get__nproc_2(
1693                     parent_team,
1694                     master_tid); // TODO: get nproc directly from current task
1695 
1696       // Do we need to take the forkjoin lock? (Not needed for a serialized
1697       // parallel outside of a teams construct.) This code was moved here from
1698       // __kmp_reserve_threads() to speed up nested serialized parallels.
1699       if (nthreads > 1) {
1700         if ((get__max_active_levels(master_th) == 1 &&
1701              (root->r.r_in_parallel && !enter_teams)) ||
1702             (__kmp_library == library_serial)) {
1703           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704                         " threads\n",
1705                         gtid, nthreads));
1706           nthreads = 1;
1707         }
1708       }
1709       if (nthreads > 1) {
1710         /* determine how many new threads we can use */
1711         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1712         /* AC: If we execute teams from parallel region (on host), then teams
1713            should be created but each can only have 1 thread if nesting is
1714            disabled. If teams called from serial region, then teams and their
1715            threads should be created regardless of the nesting setting. */
1716         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717                                          nthreads, enter_teams);
1718         if (nthreads == 1) {
1719           // Release the lock here for single-threaded execution; for
1720           // multi-threaded execution it is released later, after the team of
1721           // threads has been created and initialized.
1722           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723         }
1724       }
1725     }
1726     KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728     // If we temporarily changed the set number of threads then restore it now
1729     master_th->th.th_set_nproc = 0;
1730 
1731     /* create a serialized parallel region? */
1732     if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX &&                                                            \
1735     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736       void *args[argc];
1737 #else
1738       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740           KMP_ARCH_AARCH64) */
1741 
1742       KA_TRACE(20,
1743                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745       __kmpc_serialized_parallel(loc, gtid);
1746 
1747       if (call_context == fork_context_intel) {
1748         /* TODO this sucks, use the compiler itself to pass args! :) */
1749         master_th->th.th_serial_team->t.t_ident = loc;
1750         if (!ap) {
1751           // revert change made in __kmpc_serialized_parallel()
1752           master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756           void *dummy;
1757           void **exit_frame_p;
1758           ompt_task_info_t *task_info;
1759 
1760           ompt_lw_taskteam_t lw_taskteam;
1761 
1762           if (ompt_enabled.enabled) {
1763             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764                                     &ompt_parallel_data, return_address);
1765 
1766             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767             // don't use lw_taskteam after linking. content was swapped
1768 
1769             task_info = OMPT_CUR_TASK_INFO(master_th);
1770             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771             if (ompt_enabled.ompt_callback_implicit_task) {
1772               OMPT_CUR_TASK_INFO(master_th)
1773                   ->thread_num = __kmp_tid_from_gtid(gtid);
1774               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776                   &(task_info->task_data), 1,
1777                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778                   ompt_task_implicit);
1779             }
1780 
1781             /* OMPT state */
1782             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783           } else {
1784             exit_frame_p = &dummy;
1785           }
1786 #endif
1787 
1788           {
1789             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792                                    parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794                                    ,
1795                                    exit_frame_p
1796 #endif
1797                                    );
1798           }
1799 
1800 #if OMPT_SUPPORT
1801           if (ompt_enabled.enabled) {
1802             *exit_frame_p = NULL;
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1806                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807                   ompt_task_implicit);
1808             }
1809             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810             __ompt_lw_taskteam_unlink(master_th);
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else if (microtask == (microtask_t)__kmp_teams_master) {
1821           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822                            master_th->th.th_serial_team);
1823           team = master_th->th.th_team;
1824           // team->t.t_pkfn = microtask;
1825           team->t.t_invoke = invoker;
1826           __kmp_alloc_argv_entries(argc, team, TRUE);
1827           team->t.t_argc = argc;
1828           argv = (void **)team->t.t_argv;
1829           if (ap) {
1830             for (i = argc - 1; i >= 0; --i)
1831               *argv++ = va_arg(kmp_va_deref(ap), void *);
1832           } else {
1833             for (i = 0; i < argc; ++i)
1834               // Get args from parent team for teams construct
1835               argv[i] = parent_team->t.t_argv[i];
1836           }
1837           // AC: revert change made in __kmpc_serialized_parallel()
1838           //     because initial code in teams should have level=0
1839           team->t.t_level--;
1840           // AC: call special invoker for outer "parallel" of teams construct
1841           invoker(gtid);
1842 #if OMPT_SUPPORT
1843           if (ompt_enabled.enabled) {
1844             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845             if (ompt_enabled.ompt_callback_implicit_task) {
1846               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1848                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849             }
1850             if (ompt_enabled.ompt_callback_parallel_end) {
1851               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852                   &ompt_parallel_data, parent_task_data,
1853                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1854                   return_address);
1855             }
1856             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857           }
1858 #endif
1859         } else {
1860           argv = args;
1861           for (i = argc - 1; i >= 0; --i)
1862             *argv++ = va_arg(kmp_va_deref(ap), void *);
1863           KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866           void *dummy;
1867           void **exit_frame_p;
1868           ompt_task_info_t *task_info;
1869 
1870           ompt_lw_taskteam_t lw_taskteam;
1871 
1872           if (ompt_enabled.enabled) {
1873             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874                                     &ompt_parallel_data, return_address);
1875             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1876             // don't use lw_taskteam after linking. content was swapped
1877             task_info = OMPT_CUR_TASK_INFO(master_th);
1878             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880             /* OMPT implicit task begin */
1881             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882             if (ompt_enabled.ompt_callback_implicit_task) {
1883               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886                   ompt_task_implicit);
1887               OMPT_CUR_TASK_INFO(master_th)
1888                   ->thread_num = __kmp_tid_from_gtid(gtid);
1889             }
1890 
1891             /* OMPT state */
1892             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893           } else {
1894             exit_frame_p = &dummy;
1895           }
1896 #endif
1897 
1898           {
1899             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903                                    ,
1904                                    exit_frame_p
1905 #endif
1906                                    );
1907           }
1908 
1909 #if OMPT_SUPPORT
1910           if (ompt_enabled.enabled) {
1911             *exit_frame_p = NULL;
1912             if (ompt_enabled.ompt_callback_implicit_task) {
1913               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1915                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916                   ompt_task_implicit);
1917             }
1918 
1919             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920             __ompt_lw_taskteam_unlink(master_th);
1921             if (ompt_enabled.ompt_callback_parallel_end) {
1922               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923                   &ompt_parallel_data, parent_task_data,
1924                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1925                   return_address);
1926             }
1927             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928           }
1929 #endif
1930         }
1931       } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933         ompt_lw_taskteam_t lwt;
1934         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935                                 return_address);
1936 
1937         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1939 // don't use lw_taskteam after linking. content was swapped
1940 #endif
1941 
1942         // we were called from GNU native code
1943         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944         return FALSE;
1945       } else {
1946         KMP_ASSERT2(call_context < fork_context_last,
1947                     "__kmp_fork_call: unknown fork_context parameter");
1948       }
1949 
1950       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951       KMP_MB();
1952       return FALSE;
1953     } // if (nthreads == 1)
1954 
1955     // GEH: only modify the executing flag in the non-serialized case;
1956     //      the serialized case is handled in __kmpc_serialized_parallel
1957     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958                   "curtask=%p, curtask_max_aclevel=%d\n",
1959                   parent_team->t.t_active_level, master_th,
1960                   master_th->th.th_current_task,
1961                   master_th->th.th_current_task->td_icvs.max_active_levels));
1962     // TODO: GEH - cannot do this assertion because root thread not set up as
1963     // executing
1964     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965     master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967     if (!master_th->th.th_teams_microtask || level > teams_level) {
1968       /* Increment our nested depth level */
1969       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970     }
1971 
1972     // See if we need to make a copy of the ICVs.
1973     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974     if ((level + 1 < __kmp_nested_nth.used) &&
1975         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977     } else {
1978       nthreads_icv = 0; // don't update
1979     }
1980 
1981     // Figure out the proc_bind_policy for the new team.
1982     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983     kmp_proc_bind_t proc_bind_icv =
1984         proc_bind_default; // proc_bind_default means don't update
1985     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986       proc_bind = proc_bind_false;
1987     } else {
1988       if (proc_bind == proc_bind_default) {
1989         // No proc_bind clause specified; use current proc-bind-var for this
1990         // parallel region
1991         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992       }
1993       /* else: The proc_bind policy was specified explicitly on parallel clause.
1994          This overrides proc-bind-var for this parallel region, but does not
1995          change proc-bind-var. */
1996       // Figure the value of proc-bind-var for the child threads.
1997       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999            master_th->th.th_current_task->td_icvs.proc_bind)) {
2000         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001       }
2002     }
2003 
2004     // Reset for next parallel region
2005     master_th->th.th_set_proc_bind = proc_bind_default;
2006 
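    // If either the nproc ICV or proc-bind-var must change for the child team,
    // build a fresh ICV block for it; otherwise the new team simply inherits the
    // master's current ICVs.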
2007     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008       kmp_internal_control_t new_icvs;
2009       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010       new_icvs.next = NULL;
2011       if (nthreads_icv > 0) {
2012         new_icvs.nproc = nthreads_icv;
2013       }
2014       if (proc_bind_icv != proc_bind_default) {
2015         new_icvs.proc_bind = proc_bind_icv;
2016       }
2017 
2018       /* allocate a new parallel team */
2019       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020       team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022                                  ompt_parallel_data,
2023 #endif
2024                                  proc_bind, &new_icvs,
2025                                  argc USE_NESTED_HOT_ARG(master_th));
2026     } else {
2027       /* allocate a new parallel team */
2028       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029       team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031                                  ompt_parallel_data,
2032 #endif
2033                                  proc_bind,
2034                                  &master_th->th.th_current_task->td_icvs,
2035                                  argc USE_NESTED_HOT_ARG(master_th));
2036     }
2037     KF_TRACE(
2038         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040     /* setup the new team */
2041     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048                           return_address);
2049 #endif
2050     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051     // TODO: parent_team->t.t_level == INT_MAX ???
2052     if (!master_th->th.th_teams_microtask || level > teams_level) {
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057     } else {
2058       // AC: Do not increase parallel level at start of the teams construct
2059       int new_level = parent_team->t.t_level;
2060       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061       new_level = parent_team->t.t_active_level;
2062       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063     }
2064     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065     // set master's schedule as new run-time schedule
2066     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
2075       // Set the master's task team to the team's task team. Unless this is a
2076       // hot team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
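        // Grow the memo stack by doubling when it is full before pushing the
        // master's current task_state.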
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (master_th->th.th_hot_teams &&
2113             active_level < __kmp_hot_teams_max_level &&
2114             team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145     if (ap) {
2146       for (i = argc - 1; i >= 0; --i) {
2147         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151     } else {
2152       for (i = 0; i < argc; ++i) {
2153         // Get args from parent team for teams construct
2154         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155       }
2156     }
2157 
2158     /* now actually fork the threads */
2159     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161       root->r.r_active = TRUE;
2162 
2163     __kmp_fork_team_threads(root, team, master_th, gtid);
2164     __kmp_setup_icv_copy(team, nthreads,
2165                          &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174     if (team->t.t_active_level == 1 // only report frames at level 1
2175         && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178           (__kmp_forkjoin_frames_mode == 3 ||
2179            __kmp_forkjoin_frames_mode == 1)) {
2180         kmp_uint64 tmp_time = 0;
2181         if (__itt_get_timestamp_ptr)
2182           tmp_time = __itt_get_timestamp();
2183         // Internal fork - report frame begin
2184         master_th->th.th_frame_time = tmp_time;
2185         if (__kmp_forkjoin_frames_mode == 3)
2186           team->t.t_region_time = tmp_time;
2187       } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194       }
2195     }
2196 #endif /* USE_ITT_BUILD */
2197 
2198     /* now go on and do the work */
2199     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200     KMP_MB();
2201     KF_TRACE(10,
2202              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203               root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206     if (__itt_stack_caller_create_ptr) {
2207       team->t.t_stack_id =
2208           __kmp_itt_stack_caller_create(); // create new stack stitching id
2209       // before entering fork barrier
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
2213     // AC: skip __kmp_internal_fork at teams construct, let only master
2214     // threads execute
2215     if (ap) {
2216       __kmp_internal_fork(loc, gtid, team);
2217       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218                     "master_th=%p, gtid=%d\n",
2219                     root, team, master_th, gtid));
2220     }
2221 
2222     if (call_context == fork_context_gnu) {
2223       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224       return TRUE;
2225     }
2226 
2227     /* Invoke microtask for MASTER thread */
2228     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229                   team->t.t_id, team->t.t_pkfn));
2230   } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233   // If beginning a teams construct, then change thread state
2234   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235   if (!ap) {
2236     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237   }
2238 #endif
2239 
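  // The calling (master) thread executes its portion of the microtask directly
  // via the team's invoker.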
2240   if (!team->t.t_invoke(gtid)) {
2241     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242   }
2243 
2244 #if KMP_STATS_ENABLED
2245   // If this was the beginning of a teams construct, then reset the thread state
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(previous_state);
2248   }
2249 #endif
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252                 team->t.t_id, team->t.t_pkfn));
2253   KMP_MB(); /* Flush all pending memory write invalidates.  */
2254 
2255   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258   if (ompt_enabled.enabled) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263   return TRUE;
2264 }
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268                                             kmp_team_t *team) {
2269   // restore state outside the region
2270   thread->th.ompt_thread_info.state =
2271       ((team->t.t_serialized) ? ompt_state_work_serial
2272                               : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276                                    kmp_team_t *team, ompt_data_t *parallel_data,
2277                                    int flags, void *codeptr) {
2278   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279   if (ompt_enabled.ompt_callback_parallel_end) {
2280     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281         parallel_data, &(task_info->task_data), flags, codeptr);
2282   }
2283 
2284   task_info->frame.enter_frame = ompt_data_none;
2285   __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288 
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291                      ,
2292                      enum fork_context_e fork_context
2293 #endif
2294                      ,
2295                      int exit_teams) {
2296   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297   kmp_team_t *team;
2298   kmp_team_t *parent_team;
2299   kmp_info_t *master_th;
2300   kmp_root_t *root;
2301   int master_active;
2302 
2303   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305   /* setup current data */
2306   master_th = __kmp_threads[gtid];
2307   root = master_th->th.th_root;
2308   team = master_th->th.th_team;
2309   parent_team = team->t.t_parent;
2310 
2311   master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314   void *team_microtask = (void *)team->t.t_pkfn;
2315   // For GOMP interface with serialized parallel, need the
2316   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2317   // and end-parallel events.
2318   if (ompt_enabled.enabled &&
2319       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321   }
2322 #endif
2323 
2324 #if KMP_DEBUG
2325   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327                   "th_task_team = %p\n",
2328                   __kmp_gtid_from_thread(master_th), team,
2329                   team->t.t_task_team[master_th->th.th_task_state],
2330                   master_th->th.th_task_team));
2331     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332                      team->t.t_task_team[master_th->th.th_task_state]);
2333   }
2334 #endif
2335 
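  // Fast path: a serialized (single-thread) team only needs its
  // serialized-parallel bookkeeping unwound; there is no join barrier to wait on.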
2336   if (team->t.t_serialized) {
2337     if (master_th->th.th_teams_microtask) {
2338       // We are in teams construct
2339       int level = team->t.t_level;
2340       int tlevel = master_th->th.th_teams_level;
2341       if (level == tlevel) {
2342         // AC: we haven't incremented it earlier at start of teams construct,
2343         //     so do it here - at the end of teams construct
2344         team->t.t_level++;
2345       } else if (level == tlevel + 1) {
2346         // AC: we are exiting parallel inside teams, need to increment
2347         // serialization in order to restore it in the next call to
2348         // __kmpc_end_serialized_parallel
2349         team->t.t_serialized++;
2350       }
2351     }
2352     __kmpc_end_serialized_parallel(loc, gtid);
2353 
2354 #if OMPT_SUPPORT
2355     if (ompt_enabled.enabled) {
2356       __kmp_join_restore_state(master_th, parent_team);
2357     }
2358 #endif
2359 
2360     return;
2361   }
2362 
2363   master_active = team->t.t_master_active;
2364 
2365   if (!exit_teams) {
2366     // AC: No barrier for internal teams at exit from teams construct.
2367     //     But there is barrier for external team (league).
2368     __kmp_internal_join(loc, gtid, team);
2369   } else {
2370     master_th->th.th_task_state =
2371         0; // AC: no tasking in teams (out of any parallel)
2372   }
2373 
2374   KMP_MB();
2375 
2376 #if OMPT_SUPPORT
2377   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378   void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380 
2381 #if USE_ITT_BUILD
2382   if (__itt_stack_caller_create_ptr) {
2383     // destroy the stack stitching id after join barrier
2384     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385   }
2386   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387   if (team->t.t_active_level == 1 &&
2388       (!master_th->th.th_teams_microtask || /* not in teams construct */
2389        master_th->th.th_teams_size.nteams == 1)) {
2390     master_th->th.th_ident = loc;
2391     // only one notification scheme (either "submit" or "forking/joined", not
2392     // both)
2393     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394         __kmp_forkjoin_frames_mode == 3)
2395       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396                              master_th->th.th_frame_time, 0, loc,
2397                              master_th->th.th_team_nproc, 1);
2398     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400       __kmp_itt_region_joined(gtid);
2401   } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404   if (master_th->th.th_teams_microtask && !exit_teams &&
2405       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406       team->t.t_level == master_th->th.th_teams_level + 1) {
2407 // AC: We need to leave the team structure intact at the end of parallel
2408 // inside the teams construct, so that at the next parallel same (hot) team
2409 // works, only adjust nesting levels
2410 #if OMPT_SUPPORT
2411     ompt_data_t ompt_parallel_data = ompt_data_none;
2412     if (ompt_enabled.enabled) {
2413       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414       if (ompt_enabled.ompt_callback_implicit_task) {
2415         int ompt_team_size = team->t.t_nproc;
2416         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419       }
2420       task_info->frame.exit_frame = ompt_data_none;
2421       task_info->task_data = ompt_data_none;
2422       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423       __ompt_lw_taskteam_unlink(master_th);
2424     }
2425 #endif
2426     /* Decrement our nested depth level */
2427     team->t.t_level--;
2428     team->t.t_active_level--;
2429     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430 
2431     // Restore number of threads in the team if needed. This code relies on
2432     // the proper adjustment of th_teams_size.nth after the fork in
2433     // __kmp_teams_master on each teams master in the case that
2434     // __kmp_reserve_threads reduced it.
2435     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436       int old_num = master_th->th.th_team_nproc;
2437       int new_num = master_th->th.th_teams_size.nth;
2438       kmp_info_t **other_threads = team->t.t_threads;
2439       team->t.t_nproc = new_num;
2440       for (int i = 0; i < old_num; ++i) {
2441         other_threads[i]->th.th_team_nproc = new_num;
2442       }
2443       // Adjust states of non-used threads of the team
2444       for (int i = old_num; i < new_num; ++i) {
2445         // Re-initialize thread's barrier data.
2446         KMP_DEBUG_ASSERT(other_threads[i]);
2447         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448         for (int b = 0; b < bs_last_barrier; ++b) {
2449           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454         }
2455         if (__kmp_tasking_mode != tskm_immediate_exec) {
2456           // Synchronize thread's task state
2457           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458         }
2459       }
2460     }
2461 
2462 #if OMPT_SUPPORT
2463     if (ompt_enabled.enabled) {
2464       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466     }
2467 #endif
2468 
2469     return;
2470   }
2471 
2472   /* do cleanup and restore the parent team */
2473   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475 
2476   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477 
2478   /* jc: The following lock has instructions with REL and ACQ semantics,
2479      separating the parallel user code called in this parallel region
2480      from the serial user code called after this function returns. */
2481   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482 
2483   if (!master_th->th.th_teams_microtask ||
2484       team->t.t_level > master_th->th.th_teams_level) {
2485     /* Decrement our nested depth level */
2486     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int flags = (team_microtask == (void *)__kmp_teams_master)
2495                       ? ompt_task_initial
2496                       : ompt_task_implicit;
2497       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501     }
2502     task_info->frame.exit_frame = ompt_data_none;
2503     task_info->task_data = ompt_data_none;
2504   }
2505 #endif
2506 
2507   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508                 master_th, team));
2509   __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if KMP_AFFINITY_SUPPORTED
2512   // Restore master thread's partition.
2513   master_th->th.th_first_place = team->t.t_first_place;
2514   master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516   master_th->th.th_def_allocator = team->t.t_def_allocator;
2517 
2518   updateHWFPControl(team);
2519 
2520   if (root->r.r_active != master_active)
2521     root->r.r_active = master_active;
2522 
2523   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524                             master_th)); // this will free worker threads
2525 
2526   /* This race was fun to find. Make sure the following stays inside the
2527      critical region, otherwise assertions may fail occasionally because the old
2528      team may be reallocated and the hierarchy appears inconsistent. It is
2529      actually safe to run and won't cause any bugs, but it will trigger those
2530      assertion failures. It's only one deref&assign, so keep it in the critical region. */
2531   master_th->th.th_team = parent_team;
2532   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533   master_th->th.th_team_master = parent_team->t.t_threads[0];
2534   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535 
2536   /* restore serialized team, if need be */
2537   if (parent_team->t.t_serialized &&
2538       parent_team != master_th->th.th_serial_team &&
2539       parent_team != root->r.r_root_team) {
2540     __kmp_free_team(root,
2541                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542     master_th->th.th_serial_team = parent_team;
2543   }
2544 
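  // Restore the master's task_state from the memo stack (if one was pushed at
  // fork time) and point th_task_team back at the parent team's task team.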
2545   if (__kmp_tasking_mode != tskm_immediate_exec) {
2546     if (master_th->th.th_task_state_top >
2547         0) { // Restore task state from memo stack
2548       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549       // Remember master's state if we re-use this nested hot team
2550       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551           master_th->th.th_task_state;
2552       --master_th->th.th_task_state_top; // pop
2553       // Now restore state at this level
2554       master_th->th.th_task_state =
2555           master_th->th
2556               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557     }
2558     // Copy the task team from the parent team to the master thread
2559     master_th->th.th_task_team =
2560         parent_team->t.t_task_team[master_th->th.th_task_state];
2561     KA_TRACE(20,
2562              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564               parent_team));
2565   }
2566 
2567   // TODO: GEH - cannot do this assertion because root thread not set up as
2568   // executing
2569   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570   master_th->th.th_current_task->td_flags.executing = 1;
2571 
2572   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573 
2574 #if OMPT_SUPPORT
2575   int flags =
2576       OMPT_INVOKER(fork_context) |
2577       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578                                                       : ompt_parallel_team);
2579   if (ompt_enabled.enabled) {
2580     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581                     codeptr);
2582   }
2583 #endif
2584 
2585   KMP_MB();
2586   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
2588 
2589 /* Check whether we should push an internal control record onto the
2590    serial team stack.  If so, do it.  */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593   if (thread->th.th_team != thread->th.th_serial_team) {
2594     return;
2595   }
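  // Push at most one control record per serialized nesting level; the record's
  // serial_nesting_level is keyed on t_serialized.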
2596   if (thread->th.th_team->t.t_serialized > 1) {
2597     int push = 0;
2598 
2599     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600       push = 1;
2601     } else {
2602       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603           thread->th.th_team->t.t_serialized) {
2604         push = 1;
2605       }
2606     }
2607     if (push) { /* push a record on the serial team's stack */
2608       kmp_internal_control_t *control =
2609           (kmp_internal_control_t *)__kmp_allocate(
2610               sizeof(kmp_internal_control_t));
2611 
2612       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616       control->next = thread->th.th_team->t.t_control_stack_top;
2617       thread->th.th_team->t.t_control_stack_top = control;
2618     }
2619   }
2620 }
2621 
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624   kmp_info_t *thread;
2625   kmp_root_t *root;
2626 
2627   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628   KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630   if (new_nth < 1)
2631     new_nth = 1;
2632   else if (new_nth > __kmp_max_nth)
2633     new_nth = __kmp_max_nth;
2634 
2635   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636   thread = __kmp_threads[gtid];
2637   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638     return; // nothing to do
2639 
2640   __kmp_save_internal_controls(thread);
2641 
2642   set__nproc(thread, new_nth);
2643 
2644   // If this omp_set_num_threads() call will cause the hot team size to be
2645   // reduced (in the absence of a num_threads clause), then reduce it now,
2646   // rather than waiting for the next parallel region.
2647   root = thread->th.th_root;
2648   if (__kmp_init_parallel && (!root->r.r_active) &&
2649       (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653       ) {
2654     kmp_team_t *hot_team = root->r.r_hot_team;
2655     int f;
2656 
2657     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Release the extra threads we don't need any more.
2660     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       if (__kmp_tasking_mode != tskm_immediate_exec) {
2663         // When decreasing team size, threads no longer in the team should unref
2664         // task team.
2665         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666       }
2667       __kmp_free_thread(hot_team->t.t_threads[f]);
2668       hot_team->t.t_threads[f] = NULL;
2669     }
2670     hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672     if (thread->th.th_hot_teams) {
2673       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675     }
2676 #endif
2677 
2678     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680     // Update the t_nproc field in the threads that are still active.
2681     for (f = 0; f < new_nth; f++) {
2682       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684     }
2685     // Special flag in case omp_set_num_threads() call
2686     hot_team->t.t_size_changed = -1;
2687   }
2688 }
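// Illustrative effect (hedged example with hypothetical values): with an
// inactive root whose hot team currently holds 8 threads, calling
// omp_set_num_threads(4) from serial code releases threads 4..7 back to the
// thread pool (clearing their th_task_team first when tasking is enabled),
// shrinks t_nproc to 4, and sets the special t_size_changed flag to -1 to
// record that the size change came from an omp_set_num_threads() call.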
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692   kmp_info_t *thread;
2693 
2694   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695                 "%d = (%d)\n",
2696                 gtid, max_active_levels));
2697   KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699   // validate max_active_levels
2700   if (max_active_levels < 0) {
2701     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702     // We ignore this call if the user has specified a negative value.
2703     // The current setting won't be changed. The last valid setting will be
2704     // used. A warning will be issued (if warnings are allowed as controlled by
2705     // the KMP_WARNINGS env var).
2706     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707                   "max_active_levels for thread %d = (%d)\n",
2708                   gtid, max_active_levels));
2709     return;
2710   }
2711   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712     // it's OK, the max_active_levels is within the valid range: [ 0;
2713     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2714     // We allow a zero value. (implementation defined behavior)
2715   } else {
2716     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719     // Current upper limit is MAX_INT. (implementation defined behavior)
2720     // If the input exceeds the upper limit, we correct the input to be the
2721     // upper limit. (implementation defined behavior)
2722     // In practice, the flow should never get here while the limit is MAX_INT.
2723   }
2724   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725                 "max_active_levels for thread %d = (%d)\n",
2726                 gtid, max_active_levels));
2727 
2728   thread = __kmp_threads[gtid];
2729 
2730   __kmp_save_internal_controls(thread);
2731 
2732   set__max_active_levels(thread, max_active_levels);
2733 }
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737   kmp_info_t *thread;
2738 
2739   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740   KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742   thread = __kmp_threads[gtid];
2743   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745                 "curtask_maxaclevel=%d\n",
2746                 gtid, thread->th.th_current_task,
2747                 thread->th.th_current_task->td_icvs.max_active_levels));
2748   return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
2750 
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756   kmp_info_t *thread;
2757   kmp_sched_t orig_kind;
2758   //    kmp_team_t *team;
2759 
2760   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761                 gtid, (int)kind, chunk));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   // Check if the kind parameter is valid, correct if needed.
2765   // Valid parameters should fit in one of two intervals - standard or extended:
2766   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
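  // Illustrative mapping (editor's example based on the 2008-01-25 layout
  // above, not part of the original source): a standard kind k in 1..4 maps to
  // __kmp_sch_map[k - kmp_sched_lower - 1], i.e. slots 0..3 (unless it is the
  // unchunked-static special case handled separately below), while an extended
  // kind k in 101..102 maps to
  // __kmp_sch_map[k - kmp_sched_lower_ext + kmp_sched_upper_std -
  //               kmp_sched_lower - 2], i.e. slots 4..5.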
2768   orig_kind = kind;
2769   kind = __kmp_sched_without_mods(kind);
2770 
2771   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773     // TODO: Hint needs attention in case we change the default schedule.
2774     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776               __kmp_msg_null);
2777     kind = kmp_sched_default;
2778     chunk = 0; // ignore chunk value in case of bad kind
2779   }
2780 
2781   thread = __kmp_threads[gtid];
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   if (kind < kmp_sched_upper_std) {
2786     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787       // differentiate static chunked vs. unchunked: chunk should be
2788       // invalid to indicate an unchunked schedule (which is the default)
2789       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790     } else {
2791       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792           __kmp_sch_map[kind - kmp_sched_lower - 1];
2793     }
2794   } else {
2795     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796     //    kmp_sched_lower - 2 ];
2797     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799                       kmp_sched_lower - 2];
2800   }
2801   __kmp_sched_apply_mods_intkind(
2802       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803   if (kind == kmp_sched_auto || chunk < 1) {
2804     // ignore parameter chunk for schedule auto
2805     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806   } else {
2807     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808   }
2809 }
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813   kmp_info_t *thread;
2814   enum sched_type th_type;
2815 
2816   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819   thread = __kmp_threads[gtid];
2820 
2821   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823   case kmp_sch_static:
2824   case kmp_sch_static_greedy:
2825   case kmp_sch_static_balanced:
2826     *kind = kmp_sched_static;
2827     __kmp_sched_apply_mods_stdkind(kind, th_type);
2828     *chunk = 0; // chunk was not set, try to show this fact via zero value
2829     return;
2830   case kmp_sch_static_chunked:
2831     *kind = kmp_sched_static;
2832     break;
2833   case kmp_sch_dynamic_chunked:
2834     *kind = kmp_sched_dynamic;
2835     break;
2836   case kmp_sch_guided_chunked:
2837   case kmp_sch_guided_iterative_chunked:
2838   case kmp_sch_guided_analytical_chunked:
2839     *kind = kmp_sched_guided;
2840     break;
2841   case kmp_sch_auto:
2842     *kind = kmp_sched_auto;
2843     break;
2844   case kmp_sch_trapezoidal:
2845     *kind = kmp_sched_trapezoidal;
2846     break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848   case kmp_sch_static_steal:
2849     *kind = kmp_sched_static_steal;
2850     break;
2851 #endif
2852   default:
2853     KMP_FATAL(UnknownSchedulingType, th_type);
2854   }
2855 
2856   __kmp_sched_apply_mods_stdkind(kind, th_type);
2857   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
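// Illustrative reverse mapping (editor's note, not from the original source):
// an internal kmp_sch_static_balanced or kmp_sch_static_greedy schedule is
// reported back as kmp_sched_static with *chunk == 0 (the early return above),
// while kmp_sch_dynamic_chunked is reported as kmp_sched_dynamic with the
// chunk value stored in the ICV.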
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862   int ii, dd;
2863   kmp_team_t *team;
2864   kmp_info_t *thr;
2865 
2866   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869   // validate level
2870   if (level == 0)
2871     return 0;
2872   if (level < 0)
2873     return -1;
2874   thr = __kmp_threads[gtid];
2875   team = thr->th.th_team;
2876   ii = team->t.t_level;
2877   if (level > ii)
2878     return -1;
2879 
2880   if (thr->th.th_teams_microtask) {
2881     // AC: we are in teams region where multiple nested teams have same level
2882     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883     if (level <=
2884         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2885       KMP_DEBUG_ASSERT(ii >= tlevel);
2886       // AC: As we need to pass by the teams league, we need to artificially
2887       // increase ii
2888       if (ii == tlevel) {
2889         ii += 2; // three teams have same level
2890       } else {
2891         ii++; // two teams have same level
2892       }
2893     }
2894   }
2895 
2896   if (ii == level)
2897     return __kmp_tid_from_gtid(gtid);
2898 
2899   dd = team->t.t_serialized;
2900   level++;
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if ((team->t.t_serialized) && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       dd = team->t.t_serialized;
2911       ii--;
2912     }
2913   }
2914 
2915   return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920   int ii, dd;
2921   kmp_team_t *team;
2922   kmp_info_t *thr;
2923 
2924   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   // validate level
2928   if (level == 0)
2929     return 1;
2930   if (level < 0)
2931     return -1;
2932   thr = __kmp_threads[gtid];
2933   team = thr->th.th_team;
2934   ii = team->t.t_level;
2935   if (level > ii)
2936     return -1;
2937 
2938   if (thr->th.th_teams_microtask) {
2939     // AC: we are in teams region where multiple nested teams have same level
2940     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941     if (level <=
2942         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2943       KMP_DEBUG_ASSERT(ii >= tlevel);
2944       // AC: As we need to pass by the teams league, we need to artificially
2945       // increase ii
2946       if (ii == tlevel) {
2947         ii += 2; // three teams have same level
2948       } else {
2949         ii++; // two teams have same level
2950       }
2951     }
2952   }
2953 
2954   while (ii > level) {
2955     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956     }
2957     if (team->t.t_serialized && (!dd)) {
2958       team = team->t.t_parent;
2959       continue;
2960     }
2961     if (ii > level) {
2962       team = team->t.t_parent;
2963       ii--;
2964     }
2965   }
2966 
2967   return team->t.t_nproc;
2968 }
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971   // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2972   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2973   // independently, so the updated schedule can be obtained here.
2974 
2975   kmp_r_sched_t r_sched;
2976 
2977   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980   // different roots (even in OMP 2.5)
2981   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983   if (s == kmp_sch_static) {
2984     // replace STATIC with more detailed schedule (balanced or greedy)
2985     r_sched.r_sched_type = __kmp_static;
2986   } else if (s == kmp_sch_guided_chunked) {
2987     // replace GUIDED with more detailed schedule (iterative or analytical)
2988     r_sched.r_sched_type = __kmp_guided;
2989   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990     r_sched.r_sched_type = __kmp_sched;
2991   }
2992   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995     // __kmp_chunk may be wrong here (if it was not ever set)
2996     r_sched.chunk = KMP_DEFAULT_CHUNK;
2997   } else {
2998     r_sched.chunk = __kmp_chunk;
2999   }
3000 
3001   return r_sched;
3002 }
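// Illustrative result (hedged example): if KMP_SCHEDULE left __kmp_sched at
// plain static and __kmp_chunk was never set, the returned pair is
// { __kmp_static (balanced or greedy), KMP_DEFAULT_CHUNK }, so each root picks
// up the currently selected detailed static flavor rather than the raw enum.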
3003 
3004 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3005    at least argc number of *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008   KMP_DEBUG_ASSERT(team);
3009   if (!realloc || argc > team->t.t_max_argc) {
3010 
3011     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012                    "current entries=%d\n",
3013                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014     /* if previously allocated heap space for args, free them */
3015     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016       __kmp_free((void *)team->t.t_argv);
3017 
3018     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019       /* use unused space in the cache line for arguments */
3020       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022                      "argv entries\n",
3023                      team->t.t_id, team->t.t_max_argc));
3024       team->t.t_argv = &team->t.t_inline_argv[0];
3025       if (__kmp_storage_map) {
3026         __kmp_print_storage_map_gtid(
3027             -1, &team->t.t_inline_argv[0],
3028             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030             team->t.t_id);
3031       }
3032     } else {
3033       /* allocate space for arguments in the heap */
3034       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036                                : 2 * argc;
3037       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038                      "argv entries\n",
3039                      team->t.t_id, team->t.t_max_argc));
3040       team->t.t_argv =
3041           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042       if (__kmp_storage_map) {
3043         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044                                      &team->t.t_argv[team->t.t_max_argc],
3045                                      sizeof(void *) * team->t.t_max_argc,
3046                                      "team_%d.t_argv", team->t.t_id);
3047       }
3048     }
3049   }
3050 }
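// Illustrative sizing (editor's example): an argc that fits in
// KMP_INLINE_ARGV_ENTRIES reuses the slots embedded in the team structure; a
// larger argc allocates either KMP_MIN_MALLOC_ARGV_ENTRIES or 2 * argc heap
// entries (whichever is larger), so that modest future growth does not force a
// reallocation on every fork.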
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053   int i;
3054   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055   team->t.t_threads =
3056       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058       sizeof(dispatch_shared_info_t) * num_disp_buff);
3059   team->t.t_dispatch =
3060       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061   team->t.t_implicit_task_taskdata =
3062       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063   team->t.t_max_nproc = max_nth;
3064 
3065   /* setup dispatch buffers */
3066   for (i = 0; i < num_disp_buff; ++i) {
3067     team->t.t_disp_buffer[i].buffer_index = i;
3068     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069   }
3070 }
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074   int i;
3075   for (i = 0; i < team->t.t_max_nproc; ++i) {
3076     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078       team->t.t_dispatch[i].th_disp_buffer = NULL;
3079     }
3080   }
3081 #if KMP_USE_HIER_SCHED
3082   __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084   __kmp_free(team->t.t_threads);
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   team->t.t_threads = NULL;
3089   team->t.t_disp_buffer = NULL;
3090   team->t.t_dispatch = NULL;
3091   team->t.t_implicit_task_taskdata = 0;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095   kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097   __kmp_free(team->t.t_disp_buffer);
3098   __kmp_free(team->t.t_dispatch);
3099   __kmp_free(team->t.t_implicit_task_taskdata);
3100   __kmp_allocate_team_arrays(team, max_nth);
3101 
3102   KMP_MEMCPY(team->t.t_threads, oldThreads,
3103              team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105   __kmp_free(oldThreads);
3106 }
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115   kmp_internal_control_t g_icvs = {
3116     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118     // adjustment of threads (per thread)
3119     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120     // whether blocktime is explicitly set
3121     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127     // next parallel region (per thread)
3128     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129     __kmp_cg_max_nth, // int thread_limit;
3130     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131     // for max_active_levels
3132     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133     // {sched,chunk} pair
3134     __kmp_nested_proc_bind.bind_types[0],
3135     __kmp_default_device,
3136     NULL // struct kmp_internal_control *next;
3137   };
3138 
3139   return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144   kmp_internal_control_t gx_icvs;
3145   gx_icvs.serial_nesting_level =
3146       0; // probably =team->t.t_serial like in save_inter_controls
3147   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148   gx_icvs.next = NULL;
3149 
3150   return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154   int f;
3155   kmp_team_t *root_team;
3156   kmp_team_t *hot_team;
3157   int hot_team_max_nth;
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161   KMP_DEBUG_ASSERT(root);
3162   KMP_ASSERT(!root->r.r_begin);
3163 
3164   /* setup the root state structure */
3165   __kmp_init_lock(&root->r.r_begin_lock);
3166   root->r.r_begin = FALSE;
3167   root->r.r_active = FALSE;
3168   root->r.r_in_parallel = 0;
3169   root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171   /* setup the root team for this task */
3172   /* allocate the root team structure */
3173   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175   root_team =
3176       __kmp_allocate_team(root,
3177                           1, // new_nproc
3178                           1, // max_nproc
3179 #if OMPT_SUPPORT
3180                           ompt_data_none, // root parallel id
3181 #endif
3182                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183                           0 // argc
3184                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185                           );
3186 #if USE_DEBUGGER
3187   // Non-NULL value should be assigned to make the debugger display the root
3188   // team.
3189   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194   root->r.r_root_team = root_team;
3195   root_team->t.t_control_stack_top = NULL;
3196 
3197   /* initialize root team */
3198   root_team->t.t_threads[0] = NULL;
3199   root_team->t.t_nproc = 1;
3200   root_team->t.t_serialized = 1;
3201   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202   root_team->t.t_sched.sched = r_sched.sched;
3203   KA_TRACE(
3204       20,
3205       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208   /* setup the  hot team for this task */
3209   /* allocate the hot team structure */
3210   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212   hot_team =
3213       __kmp_allocate_team(root,
3214                           1, // new_nproc
3215                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217                           ompt_data_none, // root parallel id
3218 #endif
3219                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220                           0 // argc
3221                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222                           );
3223   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225   root->r.r_hot_team = hot_team;
3226   root_team->t.t_control_stack_top = NULL;
3227 
3228   /* first-time initialization */
3229   hot_team->t.t_parent = root_team;
3230 
3231   /* initialize hot team */
3232   hot_team_max_nth = hot_team->t.t_max_nproc;
3233   for (f = 0; f < hot_team_max_nth; ++f) {
3234     hot_team->t.t_threads[f] = NULL;
3235   }
3236   hot_team->t.t_nproc = 1;
3237   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238   hot_team->t.t_sched.sched = r_sched.sched;
3239   hot_team->t.t_size_changed = 0;
3240 }
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245   kmp_team_p const *entry;
3246   struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251     kmp_team_list_t list, // List of teams.
3252     kmp_team_p const *team // Team to add.
3253     ) {
3254 
3255   // List must terminate with item where both entry and next are NULL.
3256   // Team is added to the list only once.
3257   // List is sorted in ascending order by team id.
3258   // Team id is *not* a key.
3259 
3260   kmp_team_list_t l;
3261 
3262   KMP_DEBUG_ASSERT(list != NULL);
3263   if (team == NULL) {
3264     return;
3265   }
3266 
3267   __kmp_print_structure_team_accum(list, team->t.t_parent);
3268   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270   // Search list for the team.
3271   l = list;
3272   while (l->next != NULL && l->entry != team) {
3273     l = l->next;
3274   }
3275   if (l->next != NULL) {
3276     return; // Team has been added before, exit.
3277   }
3278 
3279   // Team is not found. Search list again for insertion point.
3280   l = list;
3281   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282     l = l->next;
3283   }
3284 
3285   // Insert team.
3286   {
3287     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288         sizeof(kmp_team_list_item_t));
3289     *item = *l;
3290     l->entry = team;
3291     l->next = item;
3292   }
3293 }
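// Editor's note on the insertion above (not from the original source): the
// assignment *item = *l copies the current node's contents into the freshly
// allocated node, after which l itself is overwritten with the new team and a
// pointer to that copy. This inserts the team in front of l's old entry while
// keeping the list sorted, without needing a separate "prev" pointer.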
3294 
3295 static void __kmp_print_structure_team(char const *title,
3296                                        kmp_team_p const *team) {
3298   __kmp_printf("%s", title);
3299   if (team != NULL) {
3300     __kmp_printf("%2x %p\n", team->t.t_id, team);
3301   } else {
3302     __kmp_printf(" - (nil)\n");
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307                                          kmp_info_p const *thread) {
3308   __kmp_printf("%s", title);
3309   if (thread != NULL) {
3310     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318   kmp_team_list_t list;
3319 
3320   // Initialize list of teams.
3321   list =
3322       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323   list->entry = NULL;
3324   list->next = NULL;
3325 
3326   __kmp_printf("\n------------------------------\nGlobal Thread "
3327                "Table\n------------------------------\n");
3328   {
3329     int gtid;
3330     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331       __kmp_printf("%2d", gtid);
3332       if (__kmp_threads != NULL) {
3333         __kmp_printf(" %p", __kmp_threads[gtid]);
3334       }
3335       if (__kmp_root != NULL) {
3336         __kmp_printf(" %p", __kmp_root[gtid]);
3337       }
3338       __kmp_printf("\n");
3339     }
3340   }
3341 
3342   // Print out __kmp_threads array.
3343   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344                "----------\n");
3345   if (__kmp_threads != NULL) {
3346     int gtid;
3347     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348       kmp_info_t const *thread = __kmp_threads[gtid];
3349       if (thread != NULL) {
3350         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3352         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3353         __kmp_print_structure_team("    Serial Team:  ",
3354                                    thread->th.th_serial_team);
3355         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3356         __kmp_print_structure_thread("    Master:       ",
3357                                      thread->th.th_team_master);
3358         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3359         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361         __kmp_print_structure_thread("    Next in pool: ",
3362                                      thread->th.th_next_pool);
3363         __kmp_printf("\n");
3364         __kmp_print_structure_team_accum(list, thread->th.th_team);
3365         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366       }
3367     }
3368   } else {
3369     __kmp_printf("Threads array is not allocated.\n");
3370   }
3371 
3372   // Print out __kmp_root array.
3373   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374                "--------\n");
3375   if (__kmp_root != NULL) {
3376     int gtid;
3377     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378       kmp_root_t const *root = __kmp_root[gtid];
3379       if (root != NULL) {
3380         __kmp_printf("GTID %2d %p:\n", gtid, root);
3381         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3382         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3383         __kmp_print_structure_thread("    Uber Thread:  ",
3384                                      root->r.r_uber_thread);
3385         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3386         __kmp_printf("    In Parallel:  %2d\n",
3387                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388         __kmp_printf("\n");
3389         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391       }
3392     }
3393   } else {
3394     __kmp_printf("Ubers array is not allocated.\n");
3395   }
3396 
3397   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398                "--------\n");
3399   while (list->next != NULL) {
3400     kmp_team_p const *team = list->entry;
3401     int i;
3402     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3404     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3405     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3406     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3407     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3408     for (i = 0; i < team->t.t_nproc; ++i) {
3409       __kmp_printf("    Thread %2d:      ", i);
3410       __kmp_print_structure_thread("", team->t.t_threads[i]);
3411     }
3412     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3413     __kmp_printf("\n");
3414     list = list->next;
3415   }
3416 
3417   // Print out __kmp_thread_pool and __kmp_team_pool.
3418   __kmp_printf("\n------------------------------\nPools\n----------------------"
3419                "--------\n");
3420   __kmp_print_structure_thread("Thread pool:          ",
3421                                CCAST(kmp_info_t *, __kmp_thread_pool));
3422   __kmp_print_structure_team("Team pool:            ",
3423                              CCAST(kmp_team_t *, __kmp_team_pool));
3424   __kmp_printf("\n");
3425 
3426   // Free team list.
3427   while (list != NULL) {
3428     kmp_team_list_item_t *item = list;
3429     list = list->next;
3430     KMP_INTERNAL_FREE(item);
3431   }
3432 }
3433 
3434 #endif
3435 
3436 //---------------------------------------------------------------------------
3437 //  Stuff for per-thread fast random number generator
3438 //  Table of primes
3439 static const unsigned __kmp_primes[] = {
3440     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 //  __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455   unsigned x = thread->th.th_x;
3456   unsigned short r = x >> 16;
3457 
3458   thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461                 thread->th.th_info.ds.ds_tid, r));
3462 
3463   return r;
3464 }
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468   unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
3470   thread->th.th_a =
3471       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473   KA_TRACE(30,
3474            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
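// Editor's sketch (hedged, not part of the original source): the generator
// above is a per-thread linear congruential sequence
//   x_{n+1} = a * x_n + 1   (mod 2^32 on the usual 32-bit unsigned)
// where 'a' is a prime chosen from __kmp_primes by the thread id, and each
// call returns the upper 16 bits of the previous state. Usage (with a
// hypothetical kmp_info_t *th):
//
//   __kmp_init_random(th);                   // seeds th_a and th_x from ds_tid
//   unsigned short r = __kmp_get_random(th); // r is in [0, 65535]
//
// Returning the high half rather than the low bits avoids the short cycles of
// the low-order bits typical of power-of-two-modulus LCGs.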
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481   int i, r = 0;
3482 
3483   for (i = 0; i < __kmp_threads_capacity; ++i) {
3484     if (KMP_UBER_GTID(i) &&
3485         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486         !__kmp_root[i]
3487              ->r.r_active) { // AC: reclaim only roots died in non-active state
3488       r += __kmp_unregister_root_other_thread(i);
3489     }
3490   }
3491   return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496    __kmp_root, and returns the number of free entries generated.
3497 
3498    For Windows* OS static library, the first mechanism used is to reclaim array
3499    entries for root threads that are already dead.
3500 
3501    On all platforms, expansion is attempted on the arrays __kmp_threads and
3502    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504    threadprivate cache array has been created. Synchronization with
3505    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507    After any dead root reclamation, if the clipping value allows array expansion
3508    to result in the generation of a total of nNeed free slots, the function does
3509    that expansion. If not, nothing is done beyond the possible initial root
3510    thread reclamation.
3511 
3512    If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nNeed) {
3514   int added = 0;
3515   int minimumRequiredCapacity;
3516   int newCapacity;
3517   kmp_info_t **newThreads;
3518   kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525   /* only for Windows static library */
3526   /* reclaim array entries for root threads that are already dead */
3527   added = __kmp_reclaim_dead_roots();
3528 
3529   if (nNeed) {
3530     nNeed -= added;
3531     if (nNeed < 0)
3532       nNeed = 0;
3533   }
3534 #endif
3535   if (nNeed <= 0)
3536     return added;
3537 
3538   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541   // > __kmp_max_nth in one of two ways:
3542   //
3543   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3544   //    may not be reused by another thread, so we may need to increase
3545   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3546   //
3547   // 2) New foreign root(s) are encountered.  We always register new foreign
3548   //    roots. This may cause a smaller # of threads to be allocated at
3549   //    subsequent parallel regions, but the worker threads hang around (and
3550   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3551   //
3552   // Anyway, that is the reason for moving the check to see if
3553   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554   // instead of having it performed here. -BB
3555 
3556   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558   /* compute expansion headroom to check if we can expand */
3559   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560     /* possible expansion too small -- give up */
3561     return added;
3562   }
3563   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
3565   newCapacity = __kmp_threads_capacity;
3566   do {
3567     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568                                                           : __kmp_sys_max_nth;
3569   } while (newCapacity < minimumRequiredCapacity);
3570   newThreads = (kmp_info_t **)__kmp_allocate(
3571       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572   newRoot =
3573       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574   KMP_MEMCPY(newThreads, __kmp_threads,
3575              __kmp_threads_capacity * sizeof(kmp_info_t *));
3576   KMP_MEMCPY(newRoot, __kmp_root,
3577              __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579   kmp_info_t **temp_threads = __kmp_threads;
3580   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582   __kmp_free(temp_threads);
3583   added += newCapacity - __kmp_threads_capacity;
3584   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586   if (newCapacity > __kmp_tp_capacity) {
3587     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589       __kmp_threadprivate_resize_cache(newCapacity);
3590     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592     }
3593     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594   }
3595 
3596   return added;
3597 }
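// Illustrative growth (editor's example with hypothetical values): with
// __kmp_threads_capacity == 32, __kmp_sys_max_nth == 1024 and nNeed == 40, the
// minimum required capacity is 72; the doubling loop tries 64, then 128, and
// stops at 128, the first value >= 72. Doubling is clipped at
// __kmp_sys_max_nth, and the threadprivate cache is resized (or its recorded
// capacity raised) when the new capacity exceeds __kmp_tp_capacity.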
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600    have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3601    the thread that calls from __kmp_do_serial_initialize(). */
3602 int __kmp_register_root(int initial_thread) {
3603   kmp_info_t *root_thread;
3604   kmp_root_t *root;
3605   int gtid;
3606   int capacity;
3607   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609   KMP_MB();
3610 
3611   /* 2007-03-02:
3612      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3613      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3614      work as expected -- it may return false (that means there is at least one
3615      empty slot in __kmp_threads array), but it is possible the only free slot
3616      is #0, which is reserved for initial thread and so cannot be used for this
3617      one. The following code works around this bug.
3618 
3619      However, the right solution seems to be not to reserve slot #0 for the
3620      initial thread because:
3621      (1) there is no magic in slot #0,
3622      (2) we cannot detect the initial thread reliably (the first thread that
3623         performs serial initialization may not be a real initial thread).
3624   */
3625   capacity = __kmp_threads_capacity;
3626   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627     --capacity;
3628   }
3629 
3630   /* see if there are too many threads */
3631   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632     if (__kmp_tp_cached) {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636     } else {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638                   __kmp_msg_null);
3639     }
3640   }
3641 
3642   /* find an available thread slot */
3643   /* Don't reassign the zero slot since we need that to only be used by initial
3644      thread */
3645   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646        gtid++)
3647     ;
3648   KA_TRACE(1,
3649            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650   KMP_ASSERT(gtid < __kmp_threads_capacity);
3651 
3652   /* update global accounting */
3653   __kmp_all_nth++;
3654   TCW_4(__kmp_nth, __kmp_nth + 1);
3655 
3656   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3658   if (__kmp_adjust_gtid_mode) {
3659     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660       if (TCR_4(__kmp_gtid_mode) != 2) {
3661         TCW_4(__kmp_gtid_mode, 2);
3662       }
3663     } else {
3664       if (TCR_4(__kmp_gtid_mode) != 1) {
3665         TCW_4(__kmp_gtid_mode, 1);
3666       }
3667     }
3668   }
3669 
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671   /* Adjust blocktime to zero if necessary            */
3672   /* Middle initialization might not have occurred yet */
3673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674     if (__kmp_nth > __kmp_avail_proc) {
3675       __kmp_zero_bt = TRUE;
3676     }
3677   }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679 
3680   /* setup this new hierarchy */
3681   if (!(root = __kmp_root[gtid])) {
3682     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684   }
3685 
3686 #if KMP_STATS_ENABLED
3687   // Initialize stats as soon as possible (right after gtid assignment).
3688   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689   __kmp_stats_thread_ptr->startLife();
3690   KMP_SET_THREAD_STATE(SERIAL_REGION);
3691   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693   __kmp_initialize_root(root);
3694 
3695   /* setup new root thread structure */
3696   if (root->r.r_uber_thread) {
3697     root_thread = root->r.r_uber_thread;
3698   } else {
3699     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700     if (__kmp_storage_map) {
3701       __kmp_print_thread_storage_map(root_thread, gtid);
3702     }
3703     root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707     root_thread->th.th_root = root;
3708     if (__kmp_env_consistency_check) {
3709       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710     }
3711 #if USE_FAST_MEMORY
3712     __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714 
3715 #if KMP_USE_BGET
3716     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717     __kmp_initialize_bget(root_thread);
3718 #endif
3719     __kmp_init_random(root_thread); // Initialize random number generator
3720   }
3721 
3722   /* setup the serial team held in reserve by the root thread */
3723   if (!root_thread->th.th_serial_team) {
3724     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726     root_thread->th.th_serial_team = __kmp_allocate_team(
3727         root, 1, 1,
3728 #if OMPT_SUPPORT
3729         ompt_data_none, // root parallel id
3730 #endif
3731         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732   }
3733   KMP_ASSERT(root_thread->th.th_serial_team);
3734   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735                 root_thread->th.th_serial_team));
3736 
3737   /* drop root_thread into place */
3738   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740   root->r.r_root_team->t.t_threads[0] = root_thread;
3741   root->r.r_hot_team->t.t_threads[0] = root_thread;
3742   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743   // AC: the team is created in reserve, not for execution (it is unused for now).
3744   root_thread->th.th_serial_team->t.t_serialized = 0;
3745   root->r.r_uber_thread = root_thread;
3746 
3747   /* initialize the thread, get it ready to go */
3748   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749   TCW_4(__kmp_init_gtid, TRUE);
3750 
3751   /* prepare the master thread for get_gtid() */
3752   __kmp_gtid_set_specific(gtid);
3753 
3754 #if USE_ITT_BUILD
3755   __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757 
3758 #ifdef KMP_TDATA_GTID
3759   __kmp_gtid = gtid;
3760 #endif
3761   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763 
3764   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765                 "plain=%u\n",
3766                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768                 KMP_INIT_BARRIER_STATE));
3769   { // Initialize barrier data.
3770     int b;
3771     for (b = 0; b < bs_last_barrier; ++b) {
3772       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776     }
3777   }
3778   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779                    KMP_INIT_BARRIER_STATE);
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786   if (TCR_4(__kmp_init_middle)) {
3787     __kmp_affinity_set_init_mask(gtid, TRUE);
3788   }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790   root_thread->th.th_def_allocator = __kmp_def_allocator;
3791   root_thread->th.th_prev_level = 0;
3792   root_thread->th.th_prev_num_threads = 1;
3793 
3794   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795   tmp->cg_root = root_thread;
3796   tmp->cg_thread_limit = __kmp_cg_max_nth;
3797   tmp->cg_nthreads = 1;
3798   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799                  " cg_nthreads init to 1\n",
3800                  root_thread, tmp));
3801   tmp->up = NULL;
3802   root_thread->th.th_cg_roots = tmp;
3803 
3804   __kmp_root_counter++;
3805 
3806 #if OMPT_SUPPORT
3807   if (!initial_thread && ompt_enabled.enabled) {
3808 
3809     kmp_info_t *root_thread = ompt_get_thread();
3810 
3811     ompt_set_thread_state(root_thread, ompt_state_overhead);
3812 
3813     if (ompt_enabled.ompt_callback_thread_begin) {
3814       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815           ompt_thread_initial, __ompt_get_thread_data_internal());
3816     }
3817     ompt_data_t *task_data;
3818     ompt_data_t *parallel_data;
3819     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3820     if (ompt_enabled.ompt_callback_implicit_task) {
3821       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823     }
3824 
3825     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826   }
3827 #endif
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   ompt_data_t *task_data;
3913   ompt_data_t *parallel_data;
3914   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3915   if (ompt_enabled.ompt_callback_implicit_task) {
3916     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918   }
3919   if (ompt_enabled.ompt_callback_thread_end) {
3920     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922   }
3923 #endif
3924 
3925   TCW_4(__kmp_nth,
3926         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3927   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929                  " to %d\n",
3930                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932   if (i == 1) {
3933     // need to free contention group structure
3934     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938     root->r.r_uber_thread->th.th_cg_roots = NULL;
3939   }
3940   __kmp_reap_thread(root->r.r_uber_thread, 1);
3941 
3942   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3943   // it instead of freeing it.
3944   root->r.r_uber_thread = NULL;
3945   /* mark root as no longer in use */
3946   root->r.r_begin = FALSE;
3947 
3948   return n;
3949 }
3950 
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3953   /* this lock should be ok, since unregister_root_current_thread is never
3954      called during an abort, only during a normal close. furthermore, if you
3955      have the forkjoin lock, you should never try to get the initz lock */
3956   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959                   "exiting T#%d\n",
3960                   gtid));
3961     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962     return;
3963   }
3964   kmp_root_t *root = __kmp_root[gtid];
3965 
3966   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967   KMP_ASSERT(KMP_UBER_GTID(gtid));
3968   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969   KMP_ASSERT(root->r.r_active == FALSE);
3970 
3971   KMP_MB();
3972 
3973   kmp_info_t *thread = __kmp_threads[gtid];
3974   kmp_team_t *team = thread->th.th_team;
3975   kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977   // we need to wait for the proxy tasks before finishing the thread
3978   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980     // the runtime is shutting down so we won't report any events
3981     thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984   }
3985 
3986   __kmp_reset_root(gtid, root);
3987 
3988   KMP_MB();
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3991 
3992   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993 }
3994 
3995 #if KMP_OS_WINDOWS
3996 /* __kmp_forkjoin_lock must be already held
3997    Unregisters a root thread that is not the current thread.  Returns the number
3998    of __kmp_threads entries freed as a result. */
3999 static int __kmp_unregister_root_other_thread(int gtid) {
4000   kmp_root_t *root = __kmp_root[gtid];
4001   int r;
4002 
4003   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4004   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4005   KMP_ASSERT(KMP_UBER_GTID(gtid));
4006   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4007   KMP_ASSERT(root->r.r_active == FALSE);
4008 
4009   r = __kmp_reset_root(gtid, root);
4010   KC_TRACE(10,
4011            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4012   return r;
4013 }
4014 #endif
4015 
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018 
4019   kmp_int32 gtid = __kmp_entry_gtid();
4020   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4021   kmp_info_t *this_thr = __kmp_threads[gtid];
4022   kmp_team_t *steam = this_thr->th.th_serial_team;
4023   kmp_team_t *team = this_thr->th.th_team;
4024 
4025   __kmp_printf(
4026       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4027       "ptask=%p\n",
4028       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4029       team->t.t_implicit_task_taskdata[tid].td_parent);
4030 }
4031 #endif // KMP_DEBUG
4032 
4033 /* TODO optimize with one big memclr, take out what isn't needed, split
4034    responsibility to workers as much as possible, and delay initialization of
4035    features as much as possible  */
4036 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4037                                   int tid, int gtid) {
4038   /* this_thr->th.th_info.ds.ds_gtid is setup in
4039      kmp_allocate_thread/create_worker.
4040      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4041   kmp_info_t *master = team->t.t_threads[0];
4042   KMP_DEBUG_ASSERT(this_thr != NULL);
4043   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4044   KMP_DEBUG_ASSERT(team);
4045   KMP_DEBUG_ASSERT(team->t.t_threads);
4046   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4047   KMP_DEBUG_ASSERT(master);
4048   KMP_DEBUG_ASSERT(master->th.th_root);
4049 
4050   KMP_MB();
4051 
4052   TCW_SYNC_PTR(this_thr->th.th_team, team);
4053 
4054   this_thr->th.th_info.ds.ds_tid = tid;
4055   this_thr->th.th_set_nproc = 0;
4056   if (__kmp_tasking_mode != tskm_immediate_exec)
4057     // When tasking is possible, threads are not safe to reap until they are
4058     // done tasking; this will be set when tasking code is exited in wait
4059     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4060   else // no tasking --> always safe to reap
4061     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4062   this_thr->th.th_set_proc_bind = proc_bind_default;
4063 #if KMP_AFFINITY_SUPPORTED
4064   this_thr->th.th_new_place = this_thr->th.th_current_place;
4065 #endif
4066   this_thr->th.th_root = master->th.th_root;
4067 
4068   /* setup the thread's cache of the team structure */
4069   this_thr->th.th_team_nproc = team->t.t_nproc;
4070   this_thr->th.th_team_master = master;
4071   this_thr->th.th_team_serialized = team->t.t_serialized;
4072   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4073 
4074   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4075 
4076   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4077                 tid, gtid, this_thr, this_thr->th.th_current_task));
4078 
4079   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4080                            team, tid, TRUE);
4081 
4082   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4083                 tid, gtid, this_thr, this_thr->th.th_current_task));
4084   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4085   // __kmp_initialize_team()?
4086 
4087   /* TODO no worksharing in speculative threads */
4088   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4089 
4090   this_thr->th.th_local.this_construct = 0;
4091 
4092   if (!this_thr->th.th_pri_common) {
4093     this_thr->th.th_pri_common =
4094         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4095     if (__kmp_storage_map) {
4096       __kmp_print_storage_map_gtid(
4097           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4098           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4099     }
4100     this_thr->th.th_pri_head = NULL;
4101   }
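  // Note: th_pri_common is the lazily allocated per-thread table used by the
  // threadprivate support; th_pri_head starts out empty and is filled in by
  // that code as threadprivate variables are registered for this thread.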
4102 
4103   if (this_thr != master && // Master's CG root is initialized elsewhere
4104       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4105     // Make new thread's CG root same as master's
4106     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4107     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4108     if (tmp) {
4109       // worker changes CG, need to check if old CG should be freed
4110       int i = tmp->cg_nthreads--;
4111       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4112                      " on node %p of thread %p to %d\n",
4113                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4114       if (i == 1) {
4115         __kmp_free(tmp); // last thread left CG --> free it
4116       }
4117     }
4118     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4119     // Increment new thread's CG root's counter to add the new thread
4120     this_thr->th.th_cg_roots->cg_nthreads++;
4121     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4122                    " node %p of thread %p to %d\n",
4123                    this_thr, this_thr->th.th_cg_roots,
4124                    this_thr->th.th_cg_roots->cg_root,
4125                    this_thr->th.th_cg_roots->cg_nthreads));
4126     this_thr->th.th_current_task->td_icvs.thread_limit =
4127         this_thr->th.th_cg_roots->cg_thread_limit;
4128   }
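  // From this point the worker shares its master's contention-group root, so
  // the thread_limit ICV of its implicit task reflects that group's limit.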
4129 
4130   /* Initialize dynamic dispatch */
4131   {
4132     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4133     // Use team max_nproc since this will never change for the team.
4134     size_t disp_size =
4135         sizeof(dispatch_private_info_t) *
4136         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4137     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4138                   team->t.t_max_nproc));
4139     KMP_ASSERT(dispatch);
4140     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4141     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4142 
4143     dispatch->th_disp_index = 0;
4144     dispatch->th_doacross_buf_idx = 0;
4145     if (!dispatch->th_disp_buffer) {
4146       dispatch->th_disp_buffer =
4147           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4148 
4149       if (__kmp_storage_map) {
4150         __kmp_print_storage_map_gtid(
4151             gtid, &dispatch->th_disp_buffer[0],
4152             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4153                                           ? 1
4154                                           : __kmp_dispatch_num_buffers],
4155             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4156                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4157             gtid, team->t.t_id, gtid);
4158       }
4159     } else {
4160       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4161     }
4162 
4163     dispatch->th_dispatch_pr_current = 0;
4164     dispatch->th_dispatch_sh_current = 0;
4165 
4166     dispatch->th_deo_fcn = 0; /* ORDERED     */
4167     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4168   }
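  // Keeping __kmp_dispatch_num_buffers private dispatch buffers per thread
  // allows several consecutive dynamically scheduled (nowait) loops to be in
  // flight without an intervening barrier; a serialized team (max_nproc == 1)
  // only ever needs a single buffer.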
4169 
4170   this_thr->th.th_next_pool = NULL;
4171 
4172   if (!this_thr->th.th_task_state_memo_stack) {
4173     size_t i;
4174     this_thr->th.th_task_state_memo_stack =
4175         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4176     this_thr->th.th_task_state_top = 0;
4177     this_thr->th.th_task_state_stack_sz = 4;
4178     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4179          ++i) // zero init the stack
4180       this_thr->th.th_task_state_memo_stack[i] = 0;
4181   }
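  // The memo stack records th_task_state per nested (hot-team) level; the
  // nested hot-team path in __kmp_allocate_team below reads it to seed the
  // task state of newly added workers.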
4182 
4183   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4184   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4185 
4186   KMP_MB();
4187 }
4188 
4189 /* allocate a new thread for the requesting team. this is only called from
4190    within a forkjoin critical section. we will first try to get an available
4191    thread from the thread pool. if none is available, we will fork a new one;
4192    this should always succeed, as the caller is expected to have checked
4193    beforehand that a new thread can be created. */
4194 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4195                                   int new_tid) {
4196   kmp_team_t *serial_team;
4197   kmp_info_t *new_thr;
4198   int new_gtid;
4199 
4200   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4201   KMP_DEBUG_ASSERT(root && team);
4202 #if !KMP_NESTED_HOT_TEAMS
4203   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4204 #endif
4205   KMP_MB();
4206 
4207   /* first, try to get one from the thread pool */
4208   if (__kmp_thread_pool) {
4209     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4210     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4211     if (new_thr == __kmp_thread_pool_insert_pt) {
4212       __kmp_thread_pool_insert_pt = NULL;
4213     }
4214     TCW_4(new_thr->th.th_in_pool, FALSE);
4215     __kmp_suspend_initialize_thread(new_thr);
4216     __kmp_lock_suspend_mx(new_thr);
4217     if (new_thr->th.th_active_in_pool == TRUE) {
4218       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4219       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4220       new_thr->th.th_active_in_pool = FALSE;
4221     }
4222     __kmp_unlock_suspend_mx(new_thr);
4223 
4224     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4225                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4226     KMP_ASSERT(!new_thr->th.th_team);
4227     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4228 
4229     /* setup the thread structure */
4230     __kmp_initialize_info(new_thr, team, new_tid,
4231                           new_thr->th.th_info.ds.ds_gtid);
4232     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4233 
4234     TCW_4(__kmp_nth, __kmp_nth + 1);
4235 
4236     new_thr->th.th_task_state = 0;
4237     new_thr->th.th_task_state_top = 0;
4238     new_thr->th.th_task_state_stack_sz = 4;
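    // A thread taken from the pool keeps its previously allocated memo stack;
    // only the task-state bookkeeping is reset here.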
4239 
4240 #ifdef KMP_ADJUST_BLOCKTIME
4241     /* Adjust blocktime back to zero if necessary */
4242     /* Middle initialization might not have occurred yet */
4243     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4244       if (__kmp_nth > __kmp_avail_proc) {
4245         __kmp_zero_bt = TRUE;
4246       }
4247     }
4248 #endif /* KMP_ADJUST_BLOCKTIME */
4249 
4250 #if KMP_DEBUG
4251     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4252     // KMP_BARRIER_PARENT_FLAG.
4253     int b;
4254     kmp_balign_t *balign = new_thr->th.th_bar;
4255     for (b = 0; b < bs_last_barrier; ++b)
4256       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4257 #endif
4258 
4259     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4260                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4261 
4262     KMP_MB();
4263     return new_thr;
4264   }
4265 
4266   /* no, we'll fork a new one */
4267   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4268   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4269 
4270 #if KMP_USE_MONITOR
4271   // If this is the first worker thread the RTL is creating, then also
4272   // launch the monitor thread.  We try to do this as early as possible.
4273   if (!TCR_4(__kmp_init_monitor)) {
4274     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4275     if (!TCR_4(__kmp_init_monitor)) {
4276       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4277       TCW_4(__kmp_init_monitor, 1);
4278       __kmp_create_monitor(&__kmp_monitor);
4279       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4280 #if KMP_OS_WINDOWS
4281       // AC: wait until monitor has started. This is a fix for CQ232808.
4282       // The reason is that if the library is loaded/unloaded in a loop with
4283       // small (parallel) work in between, then there is a high probability
4284       // that the monitor thread will not have started before the library is
4285       // shut down. At shutdown it is too late to cope with the problem,
4286       // because when the master is in DllMain (process detach) the monitor
4287       // has no chance to start (it is blocked), and the master has no means
4288       // to inform the monitor that the library is gone, because all the
4289       // memory the monitor can access is about to be released/reset.
4290       while (TCR_4(__kmp_init_monitor) < 2) {
4291         KMP_YIELD(TRUE);
4292       }
4293       KF_TRACE(10, ("after monitor thread has started\n"));
4294 #endif
4295     }
4296     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4297   }
4298 #endif
4299 
4300   KMP_MB();
4301   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4302     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4303   }
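  // new_gtid now indexes the first free __kmp_threads slot at or above 1; the
  // caller has already ensured there is room (see the capacity assert above).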
4304 
4305   /* allocate space for it. */
4306   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4307 
4308   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4309 
4310 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4311   // suppress race condition detection on synchronization flags in debug mode;
4312   // this helps analyze library internals by eliminating false positives
4313   __itt_suppress_mark_range(
4314       __itt_suppress_range, __itt_suppress_threading_errors,
4315       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4316   __itt_suppress_mark_range(
4317       __itt_suppress_range, __itt_suppress_threading_errors,
4318       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4319 #if KMP_OS_WINDOWS
4320   __itt_suppress_mark_range(
4321       __itt_suppress_range, __itt_suppress_threading_errors,
4322       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4323 #else
4324   __itt_suppress_mark_range(__itt_suppress_range,
4325                             __itt_suppress_threading_errors,
4326                             &new_thr->th.th_suspend_init_count,
4327                             sizeof(new_thr->th.th_suspend_init_count));
4328 #endif
4329   // TODO: check if we need to also suppress b_arrived flags
4330   __itt_suppress_mark_range(__itt_suppress_range,
4331                             __itt_suppress_threading_errors,
4332                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4333                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4334   __itt_suppress_mark_range(__itt_suppress_range,
4335                             __itt_suppress_threading_errors,
4336                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4337                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4338   __itt_suppress_mark_range(__itt_suppress_range,
4339                             __itt_suppress_threading_errors,
4340                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4341                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4342 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4343   if (__kmp_storage_map) {
4344     __kmp_print_thread_storage_map(new_thr, new_gtid);
4345   }
4346 
4347   // add the reserve serialized team, initialized from the team's master thread
4348   {
4349     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4350     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4351     new_thr->th.th_serial_team = serial_team =
4352         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4353 #if OMPT_SUPPORT
4354                                           ompt_data_none, // root parallel id
4355 #endif
4356                                           proc_bind_default, &r_icvs,
4357                                           0 USE_NESTED_HOT_ARG(NULL));
4358   }
4359   KMP_ASSERT(serial_team);
4360   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4361   // for execution (it is unused for now).
4362   serial_team->t.t_threads[0] = new_thr;
4363   KF_TRACE(10,
4364            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4365             new_thr));
4366 
4367   /* setup the thread structures */
4368   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4369 
4370 #if USE_FAST_MEMORY
4371   __kmp_initialize_fast_memory(new_thr);
4372 #endif /* USE_FAST_MEMORY */
4373 
4374 #if KMP_USE_BGET
4375   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4376   __kmp_initialize_bget(new_thr);
4377 #endif
4378 
4379   __kmp_init_random(new_thr); // Initialize random number generator
4380 
4381   /* Initialize these only once when thread is grabbed for a team allocation */
4382   KA_TRACE(20,
4383            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4384             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4385 
4386   int b;
4387   kmp_balign_t *balign = new_thr->th.th_bar;
4388   for (b = 0; b < bs_last_barrier; ++b) {
4389     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4390     balign[b].bb.team = NULL;
4391     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4392     balign[b].bb.use_oncore_barrier = 0;
4393   }
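  // The worker created below parks on these b_go flags in the fork barrier
  // until the master releases it for its first parallel region.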
4394 
4395   new_thr->th.th_spin_here = FALSE;
4396   new_thr->th.th_next_waiting = 0;
4397 #if KMP_OS_UNIX
4398   new_thr->th.th_blocking = false;
4399 #endif
4400 
4401 #if KMP_AFFINITY_SUPPORTED
4402   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4403   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4404   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4405   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4406 #endif
4407   new_thr->th.th_def_allocator = __kmp_def_allocator;
4408   new_thr->th.th_prev_level = 0;
4409   new_thr->th.th_prev_num_threads = 1;
4410 
4411   TCW_4(new_thr->th.th_in_pool, FALSE);
4412   new_thr->th.th_active_in_pool = FALSE;
4413   TCW_4(new_thr->th.th_active, TRUE);
4414 
4415   /* adjust the global counters */
4416   __kmp_all_nth++;
4417   __kmp_nth++;
4418 
4419   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4420   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4421   if (__kmp_adjust_gtid_mode) {
4422     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4423       if (TCR_4(__kmp_gtid_mode) != 2) {
4424         TCW_4(__kmp_gtid_mode, 2);
4425       }
4426     } else {
4427       if (TCR_4(__kmp_gtid_mode) != 1) {
4428         TCW_4(__kmp_gtid_mode, 1);
4429       }
4430     }
4431   }
4432 
4433 #ifdef KMP_ADJUST_BLOCKTIME
4434   /* Adjust blocktime back to zero if necessary       */
4435   /* Middle initialization might not have occurred yet */
4436   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4437     if (__kmp_nth > __kmp_avail_proc) {
4438       __kmp_zero_bt = TRUE;
4439     }
4440   }
4441 #endif /* KMP_ADJUST_BLOCKTIME */
4442 
4443   /* actually fork it and create the new worker thread */
4444   KF_TRACE(
4445       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4446   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4447   KF_TRACE(10,
4448            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4449 
4450   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4451                 new_gtid));
4452   KMP_MB();
4453   return new_thr;
4454 }
4455 
4456 /* Reinitialize team for reuse.
4457    The hot team code calls this routine at every fork barrier, so the EPCC
4458    barrier tests are extremely sensitive to changes in it, especially writes
4459    to the team struct, which cause a cache invalidation in all threads.
4460    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4461 static void __kmp_reinitialize_team(kmp_team_t *team,
4462                                     kmp_internal_control_t *new_icvs,
4463                                     ident_t *loc) {
4464   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4465                 team->t.t_threads[0], team));
4466   KMP_DEBUG_ASSERT(team && new_icvs);
4467   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4468   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4469 
4470   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4471   // Copy ICVs to the master thread's implicit taskdata
4472   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4473   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4474 
4475   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4476                 team->t.t_threads[0], team));
4477 }
4478 
4479 /* Initialize the team data structure.
4480    This assumes the t_threads and t_max_nproc are already set.
4481    Also, we don't touch the arguments */
4482 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4483                                   kmp_internal_control_t *new_icvs,
4484                                   ident_t *loc) {
4485   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4486 
4487   /* verify */
4488   KMP_DEBUG_ASSERT(team);
4489   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4490   KMP_DEBUG_ASSERT(team->t.t_threads);
4491   KMP_MB();
4492 
4493   team->t.t_master_tid = 0; /* not needed */
4494   /* team->t.t_master_bar;        not needed */
4495   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4496   team->t.t_nproc = new_nproc;
4497 
4498   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4499   team->t.t_next_pool = NULL;
4500   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4501    * up hot team */
4502 
4503   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4504   team->t.t_invoke = NULL; /* not needed */
4505 
4506   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4507   team->t.t_sched.sched = new_icvs->sched.sched;
4508 
4509 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4510   team->t.t_fp_control_saved = FALSE; /* not needed */
4511   team->t.t_x87_fpu_control_word = 0; /* not needed */
4512   team->t.t_mxcsr = 0; /* not needed */
4513 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4514 
4515   team->t.t_construct = 0;
4516 
4517   team->t.t_ordered.dt.t_value = 0;
4518   team->t.t_master_active = FALSE;
4519 
4520 #ifdef KMP_DEBUG
4521   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4522 #endif
4523 #if KMP_OS_WINDOWS
4524   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4525 #endif
4526 
4527   team->t.t_control_stack_top = NULL;
4528 
4529   __kmp_reinitialize_team(team, new_icvs, loc);
4530 
4531   KMP_MB();
4532   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4533 }
4534 
4535 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4536 /* Sets full mask for the thread and saves the old mask in *old_mask; no changes to structures. */
4537 static void
4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4539   if (KMP_AFFINITY_CAPABLE()) {
4540     int status;
4541     if (old_mask != NULL) {
4542       status = __kmp_get_system_affinity(old_mask, TRUE);
4543       int error = errno;
4544       if (status != 0) {
4545         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4546                     __kmp_msg_null);
4547       }
4548     }
4549     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4550   }
4551 }
4552 #endif
4553 
4554 #if KMP_AFFINITY_SUPPORTED
4555 
4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4557 // It calculates the master and worker threads' partitions based upon the
4558 // parent thread's partition, and binds each worker to a place in its partition.
4559 // The master thread's partition should already include its current binding.
4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4561   // Copy the master thread's place partition to the team struct
4562   kmp_info_t *master_th = team->t.t_threads[0];
4563   KMP_DEBUG_ASSERT(master_th != NULL);
4564   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4565   int first_place = master_th->th.th_first_place;
4566   int last_place = master_th->th.th_last_place;
4567   int masters_place = master_th->th.th_current_place;
4568   team->t.t_first_place = first_place;
4569   team->t.t_last_place = last_place;
4570 
4571   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4572                 "bound to place %d partition = [%d,%d]\n",
4573                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4574                 team->t.t_id, masters_place, first_place, last_place));
4575 
4576   switch (proc_bind) {
4577 
4578   case proc_bind_default:
4579     // serial teams might have the proc_bind policy set to proc_bind_default. It
4580     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4581     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4582     break;
4583 
4584   case proc_bind_master: {
4585     int f;
4586     int n_th = team->t.t_nproc;
4587     for (f = 1; f < n_th; f++) {
4588       kmp_info_t *th = team->t.t_threads[f];
4589       KMP_DEBUG_ASSERT(th != NULL);
4590       th->th.th_first_place = first_place;
4591       th->th.th_last_place = last_place;
4592       th->th.th_new_place = masters_place;
4593       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4594           team->t.t_display_affinity != 1) {
4595         team->t.t_display_affinity = 1;
4596       }
4597 
4598       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4599                      "partition = [%d,%d]\n",
4600                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4601                      f, masters_place, first_place, last_place));
4602     }
4603   } break;
4604 
4605   case proc_bind_close: {
4606     int f;
4607     int n_th = team->t.t_nproc;
4608     int n_places;
4609     if (first_place <= last_place) {
4610       n_places = last_place - first_place + 1;
4611     } else {
4612       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4613     }
4614     if (n_th <= n_places) {
4615       int place = masters_place;
4616       for (f = 1; f < n_th; f++) {
4617         kmp_info_t *th = team->t.t_threads[f];
4618         KMP_DEBUG_ASSERT(th != NULL);
4619 
4620         if (place == last_place) {
4621           place = first_place;
4622         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4623           place = 0;
4624         } else {
4625           place++;
4626         }
4627         th->th.th_first_place = first_place;
4628         th->th.th_last_place = last_place;
4629         th->th.th_new_place = place;
4630         if (__kmp_display_affinity && place != th->th.th_current_place &&
4631             team->t.t_display_affinity != 1) {
4632           team->t.t_display_affinity = 1;
4633         }
4634 
4635         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4636                        "partition = [%d,%d]\n",
4637                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4638                        team->t.t_id, f, place, first_place, last_place));
4639       }
4640     } else {
4641       int S, rem, gap, s_count;
4642       S = n_th / n_places;
4643       s_count = 0;
4644       rem = n_th - (S * n_places);
4645       gap = rem > 0 ? n_places / rem : n_places;
4646       int place = masters_place;
4647       int gap_ct = gap;
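      // Distribute n_th threads over n_places places starting at the master's
      // place: every place gets S threads and, while rem > 0, every gap-th
      // place gets one extra. E.g. 10 threads over 4 places gives S=2, rem=2,
      // gap=2, i.e. per-place counts of 3,2,3,2.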
4648       for (f = 0; f < n_th; f++) {
4649         kmp_info_t *th = team->t.t_threads[f];
4650         KMP_DEBUG_ASSERT(th != NULL);
4651 
4652         th->th.th_first_place = first_place;
4653         th->th.th_last_place = last_place;
4654         th->th.th_new_place = place;
4655         if (__kmp_display_affinity && place != th->th.th_current_place &&
4656             team->t.t_display_affinity != 1) {
4657           team->t.t_display_affinity = 1;
4658         }
4659         s_count++;
4660 
4661         if ((s_count == S) && rem && (gap_ct == gap)) {
4662           // do nothing, add an extra thread to place on next iteration
4663         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4664           // we added an extra thread to this place; move to next place
4665           if (place == last_place) {
4666             place = first_place;
4667           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4668             place = 0;
4669           } else {
4670             place++;
4671           }
4672           s_count = 0;
4673           gap_ct = 1;
4674           rem--;
4675         } else if (s_count == S) { // place full; don't add extra
4676           if (place == last_place) {
4677             place = first_place;
4678           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679             place = 0;
4680           } else {
4681             place++;
4682           }
4683           gap_ct++;
4684           s_count = 0;
4685         }
4686 
4687         KA_TRACE(100,
4688                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4689                   "partition = [%d,%d]\n",
4690                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4691                   th->th.th_new_place, first_place, last_place));
4692       }
4693       KMP_DEBUG_ASSERT(place == masters_place);
4694     }
4695   } break;
4696 
4697   case proc_bind_spread: {
4698     int f;
4699     int n_th = team->t.t_nproc;
4700     int n_places;
4701     int thidx;
4702     if (first_place <= last_place) {
4703       n_places = last_place - first_place + 1;
4704     } else {
4705       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4706     }
4707     if (n_th <= n_places) {
4708       int place = -1;
4709 
4710       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4711         int S = n_places / n_th;
4712         int s_count, rem, gap, gap_ct;
4713 
4714         place = masters_place;
4715         rem = n_places - n_th * S;
4716         gap = rem ? n_th / rem : 1;
4717         gap_ct = gap;
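        // Each thread receives a contiguous sub-partition of S places; while
        // rem > 0, every gap-th thread absorbs one extra place. E.g. 3 threads
        // over 10 places gives S=3, rem=1, gap=3, so the sub-partitions have
        // sizes 4, 3 and 3.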
4718         thidx = n_th;
4719         if (update_master_only == 1)
4720           thidx = 1;
4721         for (f = 0; f < thidx; f++) {
4722           kmp_info_t *th = team->t.t_threads[f];
4723           KMP_DEBUG_ASSERT(th != NULL);
4724 
4725           th->th.th_first_place = place;
4726           th->th.th_new_place = place;
4727           if (__kmp_display_affinity && place != th->th.th_current_place &&
4728               team->t.t_display_affinity != 1) {
4729             team->t.t_display_affinity = 1;
4730           }
4731           s_count = 1;
4732           while (s_count < S) {
4733             if (place == last_place) {
4734               place = first_place;
4735             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4736               place = 0;
4737             } else {
4738               place++;
4739             }
4740             s_count++;
4741           }
4742           if (rem && (gap_ct == gap)) {
4743             if (place == last_place) {
4744               place = first_place;
4745             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4746               place = 0;
4747             } else {
4748               place++;
4749             }
4750             rem--;
4751             gap_ct = 0;
4752           }
4753           th->th.th_last_place = place;
4754           gap_ct++;
4755 
4756           if (place == last_place) {
4757             place = first_place;
4758           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4759             place = 0;
4760           } else {
4761             place++;
4762           }
4763 
4764           KA_TRACE(100,
4765                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4766                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4767                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4768                     f, th->th.th_new_place, th->th.th_first_place,
4769                     th->th.th_last_place, __kmp_affinity_num_masks));
4770         }
4771       } else {
4772         /* Given a uniform space of available computation places, we can
4773            create T partitions of roughly P/T places each and put each
4774            thread into the first place of its partition. */
4775         double current = static_cast<double>(masters_place);
4776         double spacing =
4777             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
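        // E.g. with masters_place=0, n_places=8 and n_th=4, spacing is 2.25
        // and the resulting partitions are [0,1], [2,3], [4,5] and [6,7].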
4778         int first, last;
4779         kmp_info_t *th;
4780 
4781         thidx = n_th + 1;
4782         if (update_master_only == 1)
4783           thidx = 1;
4784         for (f = 0; f < thidx; f++) {
4785           first = static_cast<int>(current);
4786           last = static_cast<int>(current + spacing) - 1;
4787           KMP_DEBUG_ASSERT(last >= first);
4788           if (first >= n_places) {
4789             if (masters_place) {
4790               first -= n_places;
4791               last -= n_places;
4792               if (first == (masters_place + 1)) {
4793                 KMP_DEBUG_ASSERT(f == n_th);
4794                 first--;
4795               }
4796               if (last == masters_place) {
4797                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4798                 last--;
4799               }
4800             } else {
4801               KMP_DEBUG_ASSERT(f == n_th);
4802               first = 0;
4803               last = 0;
4804             }
4805           }
4806           if (last >= n_places) {
4807             last = (n_places - 1);
4808           }
4809           place = first;
4810           current += spacing;
4811           if (f < n_th) {
4812             KMP_DEBUG_ASSERT(0 <= first);
4813             KMP_DEBUG_ASSERT(n_places > first);
4814             KMP_DEBUG_ASSERT(0 <= last);
4815             KMP_DEBUG_ASSERT(n_places > last);
4816             KMP_DEBUG_ASSERT(last_place >= first_place);
4817             th = team->t.t_threads[f];
4818             KMP_DEBUG_ASSERT(th);
4819             th->th.th_first_place = first;
4820             th->th.th_new_place = place;
4821             th->th.th_last_place = last;
4822             if (__kmp_display_affinity && place != th->th.th_current_place &&
4823                 team->t.t_display_affinity != 1) {
4824               team->t.t_display_affinity = 1;
4825             }
4826             KA_TRACE(100,
4827                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828                       "partition = [%d,%d], spacing = %.4f\n",
4829                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4830                       team->t.t_id, f, th->th.th_new_place,
4831                       th->th.th_first_place, th->th.th_last_place, spacing));
4832           }
4833         }
4834       }
4835       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4836     } else {
4837       int S, rem, gap, s_count;
4838       S = n_th / n_places;
4839       s_count = 0;
4840       rem = n_th - (S * n_places);
4841       gap = rem > 0 ? n_places / rem : n_places;
4842       int place = masters_place;
4843       int gap_ct = gap;
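      // More threads than places: same S/rem/gap distribution as the close
      // case above, except each thread's partition collapses to the single
      // place it is bound to.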
4844       thidx = n_th;
4845       if (update_master_only == 1)
4846         thidx = 1;
4847       for (f = 0; f < thidx; f++) {
4848         kmp_info_t *th = team->t.t_threads[f];
4849         KMP_DEBUG_ASSERT(th != NULL);
4850 
4851         th->th.th_first_place = place;
4852         th->th.th_last_place = place;
4853         th->th.th_new_place = place;
4854         if (__kmp_display_affinity && place != th->th.th_current_place &&
4855             team->t.t_display_affinity != 1) {
4856           team->t.t_display_affinity = 1;
4857         }
4858         s_count++;
4859 
4860         if ((s_count == S) && rem && (gap_ct == gap)) {
4861           // do nothing, add an extra thread to place on next iteration
4862         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4863           // we added an extra thread to this place; move on to next place
4864           if (place == last_place) {
4865             place = first_place;
4866           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4867             place = 0;
4868           } else {
4869             place++;
4870           }
4871           s_count = 0;
4872           gap_ct = 1;
4873           rem--;
4874         } else if (s_count == S) { // place is full; don't add extra thread
4875           if (place == last_place) {
4876             place = first_place;
4877           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4878             place = 0;
4879           } else {
4880             place++;
4881           }
4882           gap_ct++;
4883           s_count = 0;
4884         }
4885 
4886         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887                        "partition = [%d,%d]\n",
4888                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4889                        team->t.t_id, f, th->th.th_new_place,
4890                        th->th.th_first_place, th->th.th_last_place));
4891       }
4892       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4893     }
4894   } break;
4895 
4896   default:
4897     break;
4898   }
4899 
4900   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4901 }
4902 
4903 #endif // KMP_AFFINITY_SUPPORTED
4904 
4905 /* allocate a new team data structure to use.  take one off of the free pool if
4906    available */
4907 kmp_team_t *
4908 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4909 #if OMPT_SUPPORT
4910                     ompt_data_t ompt_parallel_data,
4911 #endif
4912                     kmp_proc_bind_t new_proc_bind,
4913                     kmp_internal_control_t *new_icvs,
4914                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4915   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4916   int f;
4917   kmp_team_t *team;
4918   int use_hot_team = !root->r.r_active;
4919   int level = 0;
4920 
4921   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4922   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4923   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4924   KMP_MB();
4925 
4926 #if KMP_NESTED_HOT_TEAMS
4927   kmp_hot_team_ptr_t *hot_teams;
4928   if (master) {
4929     team = master->th.th_team;
4930     level = team->t.t_active_level;
4931     if (master->th.th_teams_microtask) { // in teams construct?
4932       if (master->th.th_teams_size.nteams > 1 &&
4933           ( // #teams > 1
4934               team->t.t_pkfn ==
4935                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4936               master->th.th_teams_level <
4937                   team->t.t_level)) { // or nested parallel inside the teams
4938         ++level; // not increment if #teams==1, or for outer fork of the teams;
4939         // increment otherwise
4940       }
4941     }
4942     hot_teams = master->th.th_hot_teams;
4943     if (level < __kmp_hot_teams_max_level && hot_teams &&
4944         hot_teams[level].hot_team) {
4945       // hot team has already been allocated for given level
4946       use_hot_team = 1;
4947     } else {
4948       use_hot_team = 0;
4949     }
4950   } else {
4951     // check we won't access uninitialized hot_teams, just in case
4952     KMP_DEBUG_ASSERT(new_nproc == 1);
4953   }
4954 #endif
4955   // Optimization to use a "hot" team
4956   if (use_hot_team && new_nproc > 1) {
4957     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4958 #if KMP_NESTED_HOT_TEAMS
4959     team = hot_teams[level].hot_team;
4960 #else
4961     team = root->r.r_hot_team;
4962 #endif
4963 #if KMP_DEBUG
4964     if (__kmp_tasking_mode != tskm_immediate_exec) {
4965       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966                     "task_team[1] = %p before reinit\n",
4967                     team->t.t_task_team[0], team->t.t_task_team[1]));
4968     }
4969 #endif
4970 
4971     // Has the number of threads changed?
4972     /* Let's assume the most common case is that the number of threads is
4973        unchanged, and put that case first. */
4974     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976       // This case can mean that omp_set_num_threads() was called and the hot
4977       // team size was already reduced, so we check the special flag
4978       if (team->t.t_size_changed == -1) {
4979         team->t.t_size_changed = 1;
4980       } else {
4981         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982       }
4983 
4984       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985       kmp_r_sched_t new_sched = new_icvs->sched;
4986       // set master's schedule as new run-time schedule
4987       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988 
4989       __kmp_reinitialize_team(team, new_icvs,
4990                               root->r.r_uber_thread->th.th_ident);
4991 
4992       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993                     team->t.t_threads[0], team));
4994       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995 
4996 #if KMP_AFFINITY_SUPPORTED
4997       if ((team->t.t_size_changed == 0) &&
4998           (team->t.t_proc_bind == new_proc_bind)) {
4999         if (new_proc_bind == proc_bind_spread) {
5000           __kmp_partition_places(
5001               team, 1); // add flag to update only master for spread
5002         }
5003         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5004                        "proc_bind = %d, partition = [%d,%d]\n",
5005                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5006                        team->t.t_last_place));
5007       } else {
5008         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5009         __kmp_partition_places(team);
5010       }
5011 #else
5012       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5013 #endif /* KMP_AFFINITY_SUPPORTED */
5014     } else if (team->t.t_nproc > new_nproc) {
5015       KA_TRACE(20,
5016                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5017                 new_nproc));
5018 
5019       team->t.t_size_changed = 1;
5020 #if KMP_NESTED_HOT_TEAMS
5021       if (__kmp_hot_teams_mode == 0) {
5022         // AC: the saved number of threads should correspond to the team's value
5023         // in this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5024         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5025         hot_teams[level].hot_team_nth = new_nproc;
5026 #endif // KMP_NESTED_HOT_TEAMS
5027         /* release the extra threads we don't need any more */
5028         for (f = new_nproc; f < team->t.t_nproc; f++) {
5029           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5030           if (__kmp_tasking_mode != tskm_immediate_exec) {
5031             // When decreasing team size, threads no longer in the team should
5032             // unref task team.
5033             team->t.t_threads[f]->th.th_task_team = NULL;
5034           }
5035           __kmp_free_thread(team->t.t_threads[f]);
5036           team->t.t_threads[f] = NULL;
5037         }
5038 #if KMP_NESTED_HOT_TEAMS
5039       } // (__kmp_hot_teams_mode == 0)
5040       else {
5041         // When keeping extra threads in team, switch threads to wait on own
5042         // b_go flag
5043         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5044           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5045           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5046           for (int b = 0; b < bs_last_barrier; ++b) {
5047             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5048               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5049             }
5050             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5051           }
5052         }
5053       }
5054 #endif // KMP_NESTED_HOT_TEAMS
5055       team->t.t_nproc = new_nproc;
5056       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5057       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5058       __kmp_reinitialize_team(team, new_icvs,
5059                               root->r.r_uber_thread->th.th_ident);
5060 
5061       // Update remaining threads
5062       for (f = 0; f < new_nproc; ++f) {
5063         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5064       }
5065 
5066       // restore the current task state of the master thread: should be the
5067       // implicit task
5068       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5069                     team->t.t_threads[0], team));
5070 
5071       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5072 
5073 #ifdef KMP_DEBUG
5074       for (f = 0; f < team->t.t_nproc; f++) {
5075         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5076                          team->t.t_threads[f]->th.th_team_nproc ==
5077                              team->t.t_nproc);
5078       }
5079 #endif
5080 
5081       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5082 #if KMP_AFFINITY_SUPPORTED
5083       __kmp_partition_places(team);
5084 #endif
5085     } else { // team->t.t_nproc < new_nproc
5086 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5087       kmp_affin_mask_t *old_mask;
5088       if (KMP_AFFINITY_CAPABLE()) {
5089         KMP_CPU_ALLOC(old_mask);
5090       }
5091 #endif
5092 
5093       KA_TRACE(20,
5094                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5095                 new_nproc));
5096 
5097       team->t.t_size_changed = 1;
5098 
5099 #if KMP_NESTED_HOT_TEAMS
5100       int avail_threads = hot_teams[level].hot_team_nth;
5101       if (new_nproc < avail_threads)
5102         avail_threads = new_nproc;
5103       kmp_info_t **other_threads = team->t.t_threads;
5104       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5105         // Adjust barrier data of reserved threads (if any) of the team
5106         // Other data will be set in __kmp_initialize_info() below.
5107         int b;
5108         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5109         for (b = 0; b < bs_last_barrier; ++b) {
5110           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5111           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5112 #if USE_DEBUGGER
5113           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5114 #endif
5115         }
5116       }
5117       if (hot_teams[level].hot_team_nth >= new_nproc) {
5118         // we have all needed threads in reserve, no need to allocate any;
5119         // this is only possible in mode 1 (mode 0 cannot have reserved threads)
5120         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5121         team->t.t_nproc = new_nproc; // just get reserved threads involved
5122       } else {
5123         // we may have some threads in reserve, but not enough
5124         team->t.t_nproc =
5125             hot_teams[level]
5126                 .hot_team_nth; // get reserved threads involved if any
5127         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5128 #endif // KMP_NESTED_HOT_TEAMS
5129         if (team->t.t_max_nproc < new_nproc) {
5130           /* reallocate larger arrays */
5131           __kmp_reallocate_team_arrays(team, new_nproc);
5132           __kmp_reinitialize_team(team, new_icvs, NULL);
5133         }
5134 
5135 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5136         /* Temporarily set the full mask for the master thread before creating
5137            the workers. The reason is that workers inherit affinity from the
5138            master, so if many workers are created on a single core quickly, they
5139            don't get a chance to set their own affinity for a long time. */
5140         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5141 #endif
5142 
5143         /* allocate new threads for the hot team */
5144         for (f = team->t.t_nproc; f < new_nproc; f++) {
5145           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5146           KMP_DEBUG_ASSERT(new_worker);
5147           team->t.t_threads[f] = new_worker;
5148 
5149           KA_TRACE(20,
5150                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5151                     "join=%llu, plain=%llu\n",
5152                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5153                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5154                     team->t.t_bar[bs_plain_barrier].b_arrived));
5155 
5156           { // Initialize barrier data for new threads.
5157             int b;
5158             kmp_balign_t *balign = new_worker->th.th_bar;
5159             for (b = 0; b < bs_last_barrier; ++b) {
5160               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5161               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5162                                KMP_BARRIER_PARENT_FLAG);
5163 #if USE_DEBUGGER
5164               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5165 #endif
5166             }
5167           }
5168         }
5169 
5170 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5171         if (KMP_AFFINITY_CAPABLE()) {
5172           /* Restore initial master thread's affinity mask */
5173           __kmp_set_system_affinity(old_mask, TRUE);
5174           KMP_CPU_FREE(old_mask);
5175         }
5176 #endif
5177 #if KMP_NESTED_HOT_TEAMS
5178       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5179 #endif // KMP_NESTED_HOT_TEAMS
5180       /* make sure everyone is synchronized */
5181       int old_nproc = team->t.t_nproc; // save old value and use to update only
5182       // new threads below
5183       __kmp_initialize_team(team, new_nproc, new_icvs,
5184                             root->r.r_uber_thread->th.th_ident);
5185 
5186       /* reinitialize the threads */
5187       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5188       for (f = 0; f < team->t.t_nproc; ++f)
5189         __kmp_initialize_info(team->t.t_threads[f], team, f,
5190                               __kmp_gtid_from_tid(f, team));
5191 
5192       if (level) { // set th_task_state for new threads in nested hot team
5193         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5194         // only need to set the th_task_state for the new threads. th_task_state
5195         // for master thread will not be accurate until after this in
5196         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5197         // correct value.
5198         for (f = old_nproc; f < team->t.t_nproc; ++f)
5199           team->t.t_threads[f]->th.th_task_state =
5200               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5201       } else { // set th_task_state for new threads in non-nested hot team
5202         int old_state =
5203             team->t.t_threads[0]->th.th_task_state; // copy master's state
5204         for (f = old_nproc; f < team->t.t_nproc; ++f)
5205           team->t.t_threads[f]->th.th_task_state = old_state;
5206       }
5207 
5208 #ifdef KMP_DEBUG
5209       for (f = 0; f < team->t.t_nproc; ++f) {
5210         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5211                          team->t.t_threads[f]->th.th_team_nproc ==
5212                              team->t.t_nproc);
5213       }
5214 #endif
5215 
5216       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5217 #if KMP_AFFINITY_SUPPORTED
5218       __kmp_partition_places(team);
5219 #endif
5220     } // Check changes in number of threads
5221 
5222     kmp_info_t *master = team->t.t_threads[0];
5223     if (master->th.th_teams_microtask) {
5224       for (f = 1; f < new_nproc; ++f) {
5225         // propagate teams construct specific info to workers
5226         kmp_info_t *thr = team->t.t_threads[f];
5227         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5228         thr->th.th_teams_level = master->th.th_teams_level;
5229         thr->th.th_teams_size = master->th.th_teams_size;
5230       }
5231     }
5232 #if KMP_NESTED_HOT_TEAMS
5233     if (level) {
5234       // Sync barrier state for nested hot teams, not needed for outermost hot
5235       // team.
5236       for (f = 1; f < new_nproc; ++f) {
5237         kmp_info_t *thr = team->t.t_threads[f];
5238         int b;
5239         kmp_balign_t *balign = thr->th.th_bar;
5240         for (b = 0; b < bs_last_barrier; ++b) {
5241           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5242           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5243 #if USE_DEBUGGER
5244           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5245 #endif
5246         }
5247       }
5248     }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250 
5251     /* reallocate space for arguments if necessary */
5252     __kmp_alloc_argv_entries(argc, team, TRUE);
5253     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5254     // The hot team re-uses the previous task team,
5255     // if untouched during the previous release->gather phase.
5256 
5257     KF_TRACE(10, (" hot_team = %p\n", team));
5258 
5259 #if KMP_DEBUG
5260     if (__kmp_tasking_mode != tskm_immediate_exec) {
5261       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5262                     "task_team[1] = %p after reinit\n",
5263                     team->t.t_task_team[0], team->t.t_task_team[1]));
5264     }
5265 #endif
5266 
5267 #if OMPT_SUPPORT
5268     __ompt_team_assign_id(team, ompt_parallel_data);
5269 #endif
5270 
5271     KMP_MB();
5272 
5273     return team;
5274   }
5275 
5276   /* next, let's try to take one from the team pool */
5277   KMP_MB();
5278   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5279     /* TODO: consider resizing undersized teams instead of reaping them, now
5280        that we have a resizing mechanism */
5281     if (team->t.t_max_nproc >= max_nproc) {
5282       /* take this team from the team pool */
5283       __kmp_team_pool = team->t.t_next_pool;
5284 
5285       /* setup the team for fresh use */
5286       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5287 
5288       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5289                     "task_team[1] %p to NULL\n",
5290                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5291       team->t.t_task_team[0] = NULL;
5292       team->t.t_task_team[1] = NULL;
5293 
5294       /* reallocate space for arguments if necessary */
5295       __kmp_alloc_argv_entries(argc, team, TRUE);
5296       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5297 
5298       KA_TRACE(
5299           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5300                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5301       { // Initialize barrier data.
5302         int b;
5303         for (b = 0; b < bs_last_barrier; ++b) {
5304           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5305 #if USE_DEBUGGER
5306           team->t.t_bar[b].b_master_arrived = 0;
5307           team->t.t_bar[b].b_team_arrived = 0;
5308 #endif
5309         }
5310       }
5311 
5312       team->t.t_proc_bind = new_proc_bind;
5313 
5314       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5315                     team->t.t_id));
5316 
5317 #if OMPT_SUPPORT
5318       __ompt_team_assign_id(team, ompt_parallel_data);
5319 #endif
5320 
5321       KMP_MB();
5322 
5323       return team;
5324     }
5325 
5326     /* reap team if it is too small, then loop back and check the next one */
5327     // not sure if this is wise, but it will be redone during the hot-teams
5328     // rewrite.
5329     /* TODO: Use technique to find the right size hot-team, don't reap them */
5330     team = __kmp_reap_team(team);
5331     __kmp_team_pool = team;
5332   }
5333 
5334   /* nothing available in the pool, no matter, make a new team! */
5335   KMP_MB();
5336   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5337 
5338   /* and set it up */
5339   team->t.t_max_nproc = max_nproc;
5340   /* NOTE well, for some reason allocating one big buffer and dividing it up
5341      seems to really hurt performance a lot on the P4, so, let's not use this */
5342   __kmp_allocate_team_arrays(team, max_nproc);
5343 
5344   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5345   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5346 
5347   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5348                 "%p to NULL\n",
5349                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5350   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5351   // memory, no need to duplicate
5352   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5353   // memory, no need to duplicate
5354 
5355   if (__kmp_storage_map) {
5356     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5357   }
5358 
5359   /* allocate space for arguments */
5360   __kmp_alloc_argv_entries(argc, team, FALSE);
5361   team->t.t_argc = argc;
5362 
5363   KA_TRACE(20,
5364            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5365             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5366   { // Initialize barrier data.
5367     int b;
5368     for (b = 0; b < bs_last_barrier; ++b) {
5369       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5370 #if USE_DEBUGGER
5371       team->t.t_bar[b].b_master_arrived = 0;
5372       team->t.t_bar[b].b_team_arrived = 0;
5373 #endif
5374     }
5375   }
5376 
5377   team->t.t_proc_bind = new_proc_bind;
5378 
5379 #if OMPT_SUPPORT
5380   __ompt_team_assign_id(team, ompt_parallel_data);
5381   team->t.ompt_serialized_team_info = NULL;
5382 #endif
5383 
5384   KMP_MB();
5385 
5386   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5387                 team->t.t_id));
5388 
5389   return team;
5390 }
5391 
5392 /* TODO implement hot-teams at all levels */
5393 /* TODO implement lazy thread release on demand (disband request) */
5394 
5395 /* free the team.  return it to the team pool.  release all the threads
5396  * associated with it */
5397 void __kmp_free_team(kmp_root_t *root,
5398                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5399   int f;
5400   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5401                 team->t.t_id));
5402 
5403   /* verify state */
5404   KMP_DEBUG_ASSERT(root);
5405   KMP_DEBUG_ASSERT(team);
5406   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5407   KMP_DEBUG_ASSERT(team->t.t_threads);
5408 
5409   int use_hot_team = team == root->r.r_hot_team;
5410 #if KMP_NESTED_HOT_TEAMS
5411   int level;
5412   kmp_hot_team_ptr_t *hot_teams;
5413   if (master) {
5414     level = team->t.t_active_level - 1;
5415     if (master->th.th_teams_microtask) { // in teams construct?
5416       if (master->th.th_teams_size.nteams > 1) {
5417         ++level; // level was not increased in teams construct for
5418         // team_of_masters
5419       }
5420       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5421           master->th.th_teams_level == team->t.t_level) {
5422         ++level; // level was not increased in teams construct for
5423         // team_of_workers before the parallel
5424       } // team->t.t_level will be increased inside parallel
5425     }
5426     hot_teams = master->th.th_hot_teams;
5427     if (level < __kmp_hot_teams_max_level) {
5428       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5429       use_hot_team = 1;
5430     }
5431   }
5432 #endif // KMP_NESTED_HOT_TEAMS
5433 
5434   /* team is done working */
5435   TCW_SYNC_PTR(team->t.t_pkfn,
5436                NULL); // Important for Debugging Support Library.
5437 #if KMP_OS_WINDOWS
5438   team->t.t_copyin_counter = 0; // init counter for possible reuse
5439 #endif
5440   // Do not reset pointer to parent team to NULL for hot teams.
5441 
5442   /* if we are non-hot team, release our threads */
5443   if (!use_hot_team) {
5444     if (__kmp_tasking_mode != tskm_immediate_exec) {
5445       // Wait for threads to reach reapable state
5446       for (f = 1; f < team->t.t_nproc; ++f) {
5447         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5448         kmp_info_t *th = team->t.t_threads[f];
5449         volatile kmp_uint32 *state = &th->th.th_reap_state;
5450         while (*state != KMP_SAFE_TO_REAP) {
5451 #if KMP_OS_WINDOWS
5452           // On Windows a thread can be killed at any time, check this
5453           DWORD ecode;
5454           if (!__kmp_is_thread_alive(th, &ecode)) {
5455             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5456             break;
5457           }
5458 #endif
5459           // first check if thread is sleeping
5460           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5461           if (fl.is_sleeping())
5462             fl.resume(__kmp_gtid_from_thread(th));
5463           KMP_CPU_PAUSE();
5464         }
5465       }
5466 
5467       // Delete task teams
5468       int tt_idx;
5469       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5470         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5471         if (task_team != NULL) {
5472           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5473             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5474             team->t.t_threads[f]->th.th_task_team = NULL;
5475           }
5476           KA_TRACE(
5477               20,
5478               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5479                __kmp_get_gtid(), task_team, team->t.t_id));
5480 #if KMP_NESTED_HOT_TEAMS
5481           __kmp_free_task_team(master, task_team);
5482 #endif
5483           team->t.t_task_team[tt_idx] = NULL;
5484         }
5485       }
5486     }
5487 
5488     // Reset pointer to parent team only for non-hot teams.
5489     team->t.t_parent = NULL;
5490     team->t.t_level = 0;
5491     team->t.t_active_level = 0;
5492 
5493     /* free the worker threads */
5494     for (f = 1; f < team->t.t_nproc; ++f) {
5495       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5496       __kmp_free_thread(team->t.t_threads[f]);
5497       team->t.t_threads[f] = NULL;
5498     }
5499 
5500     /* put the team back in the team pool */
5501     /* TODO limit size of team pool, call reap_team if pool too large */
5502     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5503     __kmp_team_pool = (volatile kmp_team_t *)team;
5504   } else { // Check if team was created for the masters in a teams construct
5505     // See if first worker is a CG root
5506     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5507                      team->t.t_threads[1]->th.th_cg_roots);
5508     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5509       // Clean up the CG root nodes on workers so that this team can be re-used
5510       for (f = 1; f < team->t.t_nproc; ++f) {
5511         kmp_info_t *thr = team->t.t_threads[f];
5512         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5513                          thr->th.th_cg_roots->cg_root == thr);
5514         // Pop current CG root off list
5515         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5516         thr->th.th_cg_roots = tmp->up;
5517         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5518                        " up to node %p. cg_nthreads was %d\n",
5519                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5520         int i = tmp->cg_nthreads--;
5521         if (i == 1) {
5522           __kmp_free(tmp); // free CG if we are the last thread in it
5523         }
5524         // Restore current task's thread_limit from CG root
5525         if (thr->th.th_cg_roots)
5526           thr->th.th_current_task->td_icvs.thread_limit =
5527               thr->th.th_cg_roots->cg_thread_limit;
5528       }
5529     }
5530   }
5531 
5532   KMP_MB();
5533 }
5534 
5535 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5536 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5537   kmp_team_t *next_pool = team->t.t_next_pool;
5538 
5539   KMP_DEBUG_ASSERT(team);
5540   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5541   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5542   KMP_DEBUG_ASSERT(team->t.t_threads);
5543   KMP_DEBUG_ASSERT(team->t.t_argv);
5544 
5545   /* TODO clean the threads that are a part of this? */
5546 
5547   /* free stuff */
5548   __kmp_free_team_arrays(team);
5549   if (team->t.t_argv != &team->t.t_inline_argv[0])
5550     __kmp_free((void *)team->t.t_argv);
5551   __kmp_free(team);
5552 
5553   KMP_MB();
5554   return next_pool;
5555 }
5556 
5557 // Free the thread.  Don't reap it, just place it on the pool of available
5558 // threads.
5559 //
5560 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5561 // binding for the affinity mechanism to be useful.
5562 //
5563 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5564 // However, we want to avoid a potential performance problem by always
5565 // scanning through the list to find the correct point at which to insert
5566 // the thread (potential N**2 behavior).  To do this we keep track of the
5567 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5568 // With single-level parallelism, threads will always be added to the tail
5569 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5570 // parallelism, all bets are off and we may need to scan through the entire
5571 // free list.
5572 //
5573 // This change also has a potentially large performance benefit, for some
5574 // applications.  Previously, as threads were freed from the hot team, they
5575 // would be placed back on the free list in inverse order.  If the hot team
5576 // grew back to its original size, then the freed threads would be placed
5577 // back on the hot team in reverse order.  This could cause bad cache
5578 // locality problems on programs where the size of the hot team regularly
5579 // grew and shrunk.
5580 //
5581 // Now, for single-level parallelism, the OMP tid is always == gtid.
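// Illustrative, standalone sketch of the sorted-insert-with-remembered-
// insert-point idea described above. It is not compiled into the runtime and
// the pool_node type and helper names are hypothetical: in-order inserts reuse
// the cached insertion point, while an out-of-order insert falls back to
// scanning from the head of the list.
#if 0
struct pool_node {
  int gtid;
  pool_node *next;
};

static pool_node *pool_head = nullptr;
static pool_node *pool_insert_pt = nullptr; // last insertion point (may lag)

static void pool_insert_sorted(pool_node *node) {
  // Invalidate the cached insert point if it is already past the new node.
  if (pool_insert_pt != nullptr && pool_insert_pt->gtid > node->gtid)
    pool_insert_pt = nullptr;
  // Start scanning either at the cached point or at the head of the list.
  pool_node **scan =
      (pool_insert_pt != nullptr) ? &pool_insert_pt->next : &pool_head;
  // Advance until the next element's gtid is not smaller than ours; with
  // single-level parallelism this loop performs zero iterations.
  while (*scan != nullptr && (*scan)->gtid < node->gtid)
    scan = &(*scan)->next;
  // Splice the node in and remember it as the new insertion point.
  node->next = *scan;
  *scan = node;
  pool_insert_pt = node;
}
#endif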
5582 void __kmp_free_thread(kmp_info_t *this_th) {
5583   int gtid;
5584   kmp_info_t **scan;
5585 
5586   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5587                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5588 
5589   KMP_DEBUG_ASSERT(this_th);
5590 
5591   // When moving thread to pool, switch thread to wait on own b_go flag, and
5592   // uninitialized (NULL team).
5593   int b;
5594   kmp_balign_t *balign = this_th->th.th_bar;
5595   for (b = 0; b < bs_last_barrier; ++b) {
5596     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5597       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5598     balign[b].bb.team = NULL;
5599     balign[b].bb.leaf_kids = 0;
5600   }
5601   this_th->th.th_task_state = 0;
5602   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5603 
5604   /* put thread back on the free pool */
5605   TCW_PTR(this_th->th.th_team, NULL);
5606   TCW_PTR(this_th->th.th_root, NULL);
5607   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5608 
5609   while (this_th->th.th_cg_roots) {
5610     this_th->th.th_cg_roots->cg_nthreads--;
5611     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5612                    " %p of thread  %p to %d\n",
5613                    this_th, this_th->th.th_cg_roots,
5614                    this_th->th.th_cg_roots->cg_root,
5615                    this_th->th.th_cg_roots->cg_nthreads));
5616     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5617     if (tmp->cg_root == this_th) { // Thread is a cg_root
5618       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5619       KA_TRACE(
5620           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5621       this_th->th.th_cg_roots = tmp->up;
5622       __kmp_free(tmp);
5623     } else { // Worker thread
5624       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5625         __kmp_free(tmp);
5626       }
5627       this_th->th.th_cg_roots = NULL;
5628       break;
5629     }
5630   }
5631 
5632   /* If the implicit task assigned to this thread can be used by other threads
5633    * -> multiple threads can share the data and try to free the task at
5634    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5635  * with higher probability when the hot team is disabled but can occur even when
5636    * the hot team is enabled */
5637   __kmp_free_implicit_task(this_th);
5638   this_th->th.th_current_task = NULL;
5639 
5640   // If the __kmp_thread_pool_insert_pt is already past the new insert
5641   // point, then we need to re-scan the entire list.
5642   gtid = this_th->th.th_info.ds.ds_gtid;
5643   if (__kmp_thread_pool_insert_pt != NULL) {
5644     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5645     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5646       __kmp_thread_pool_insert_pt = NULL;
5647     }
5648   }
5649 
5650   // Scan down the list to find the place to insert the thread.
5651   // scan is the address of a link in the list, possibly the address of
5652   // __kmp_thread_pool itself.
5653   //
5654   // In the absence of nested parallelism, the for loop will have 0 iterations.
5655   if (__kmp_thread_pool_insert_pt != NULL) {
5656     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5657   } else {
5658     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5659   }
5660   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5661        scan = &((*scan)->th.th_next_pool))
5662     ;
5663 
5664   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5665   // to its address.
5666   TCW_PTR(this_th->th.th_next_pool, *scan);
5667   __kmp_thread_pool_insert_pt = *scan = this_th;
5668   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5669                    (this_th->th.th_info.ds.ds_gtid <
5670                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5671   TCW_4(this_th->th.th_in_pool, TRUE);
5672   __kmp_suspend_initialize_thread(this_th);
5673   __kmp_lock_suspend_mx(this_th);
5674   if (this_th->th.th_active == TRUE) {
5675     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5676     this_th->th.th_active_in_pool = TRUE;
5677   }
5678 #if KMP_DEBUG
5679   else {
5680     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5681   }
5682 #endif
5683   __kmp_unlock_suspend_mx(this_th);
5684 
5685   TCW_4(__kmp_nth, __kmp_nth - 1);
5686 
5687 #ifdef KMP_ADJUST_BLOCKTIME
5688   /* Adjust blocktime back to user setting or default if necessary */
5689   /* Middle initialization might never have occurred                */
5690   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5691     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5692     if (__kmp_nth <= __kmp_avail_proc) {
5693       __kmp_zero_bt = FALSE;
5694     }
5695   }
5696 #endif /* KMP_ADJUST_BLOCKTIME */
5697 
5698   KMP_MB();
5699 }
5700 
5701 /* ------------------------------------------------------------------------ */
5702 
5703 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5704   int gtid = this_thr->th.th_info.ds.ds_gtid;
5705   /*    void                 *stack_data;*/
5706   kmp_team_t **volatile pteam;
5707 
5708   KMP_MB();
5709   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5710 
5711   if (__kmp_env_consistency_check) {
5712     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5713   }
5714 
5715 #if OMPT_SUPPORT
5716   ompt_data_t *thread_data;
5717   if (ompt_enabled.enabled) {
5718     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5719     *thread_data = ompt_data_none;
5720 
5721     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5722     this_thr->th.ompt_thread_info.wait_id = 0;
5723     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5724     this_thr->th.ompt_thread_info.parallel_flags = 0;
5725     if (ompt_enabled.ompt_callback_thread_begin) {
5726       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5727           ompt_thread_worker, thread_data);
5728     }
5729     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5730   }
5731 #endif
5732 
5733   /* This is the place where threads wait for work */
5734   while (!TCR_4(__kmp_global.g.g_done)) {
5735     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5736     KMP_MB();
5737 
5738     /* wait for work to do */
5739     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5740 
5741     /* No tid yet since not part of a team */
5742     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5743 
5744 #if OMPT_SUPPORT
5745     if (ompt_enabled.enabled) {
5746       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5747     }
5748 #endif
5749 
5750     pteam = &this_thr->th.th_team;
5751 
5752     /* have we been allocated? */
5753     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5754       /* we were just woken up, so run our new task */
5755       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5756         int rc;
5757         KA_TRACE(20,
5758                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5759                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5760                   (*pteam)->t.t_pkfn));
5761 
5762         updateHWFPControl(*pteam);
5763 
5764 #if OMPT_SUPPORT
5765         if (ompt_enabled.enabled) {
5766           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5767         }
5768 #endif
5769 
5770         rc = (*pteam)->t.t_invoke(gtid);
5771         KMP_ASSERT(rc);
5772 
5773         KMP_MB();
5774         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5775                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5776                       (*pteam)->t.t_pkfn));
5777       }
5778 #if OMPT_SUPPORT
5779       if (ompt_enabled.enabled) {
5780         /* no frame set while outside task */
5781         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5782 
5783         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5784       }
5785 #endif
5786       /* join barrier after parallel region */
5787       __kmp_join_barrier(gtid);
5788     }
5789   }
5790   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5791 
5792 #if OMPT_SUPPORT
5793   if (ompt_enabled.ompt_callback_thread_end) {
5794     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5795   }
5796 #endif
5797 
5798   this_thr->th.th_task_team = NULL;
5799   /* run the destructors for the threadprivate data for this thread */
5800   __kmp_common_destroy_gtid(gtid);
5801 
5802   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5803   KMP_MB();
5804   return this_thr;
5805 }
5806 
5807 /* ------------------------------------------------------------------------ */
5808 
5809 void __kmp_internal_end_dest(void *specific_gtid) {
5810 #if KMP_COMPILER_ICC
5811 #pragma warning(push)
5812 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5813 // significant bits
5814 #endif
5815   // Make sure no significant bits are lost
5816   int gtid = (kmp_intptr_t)specific_gtid - 1;
5817 #if KMP_COMPILER_ICC
5818 #pragma warning(pop)
5819 #endif
5820 
5821   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5822   /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5823    * this is because 0 is reserved for the nothing-stored case */
5824 
5825   __kmp_internal_end_thread(gtid);
5826 }
5827 
5828 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5829 
5830 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5831   __kmp_internal_end_atexit();
5832 }
5833 
5834 #endif
5835 
5836 /* [Windows] josh: when the atexit handler is called, there may still be more
5837    than one thread alive */
5838 void __kmp_internal_end_atexit(void) {
5839   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5840   /* [Windows]
5841      josh: ideally, we want to completely shutdown the library in this atexit
5842      handler, but stat code that depends on thread specific data for gtid fails
5843      because that data becomes unavailable at some point during the shutdown, so
5844      we call __kmp_internal_end_thread instead. We should eventually remove the
5845      dependency on __kmp_get_specific_gtid in the stat code and use
5846      __kmp_internal_end_library to cleanly shutdown the library.
5847 
5848      // TODO: Can some of this comment about GVS be removed?
5849      I suspect that the offending stat code is executed when the calling thread
5850      tries to clean up a dead root thread's data structures, resulting in GVS
5851      code trying to close the GVS structures for that thread, but since the stat
5852      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5853      the calling thread is cleaning up itself instead of another thread, it gets
5854      confused. This happens because allowing a thread to unregister and cleanup
5855      another thread is a recent modification for addressing an issue.
5856      Based on the current design (20050722), a thread may end up
5857      trying to unregister another thread only if thread death does not trigger
5858      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5859      thread specific data destructor function to detect thread death. For
5860      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5861      is nothing.  Thus, the workaround is applicable only for Windows static
5862      stat library. */
5863   __kmp_internal_end_library(-1);
5864 #if KMP_OS_WINDOWS
5865   __kmp_close_console();
5866 #endif
5867 }
5868 
5869 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5870   // It is assumed __kmp_forkjoin_lock is acquired.
5871 
5872   int gtid;
5873 
5874   KMP_DEBUG_ASSERT(thread != NULL);
5875 
5876   gtid = thread->th.th_info.ds.ds_gtid;
5877 
5878   if (!is_root) {
5879     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5880       /* Assume the threads are at the fork barrier here */
5881       KA_TRACE(
5882           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5883                gtid));
5884       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5885        * (GEH) */
5886       ANNOTATE_HAPPENS_BEFORE(thread);
5887       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5888                          thread);
5889       __kmp_release_64(&flag);
5890     }
5891 
5892     // Terminate OS thread.
5893     __kmp_reap_worker(thread);
5894 
5895     // The thread was killed asynchronously.  If it was actively
5896     // spinning in the thread pool, decrement the global count.
5897     //
5898     // There is a small timing hole here - if the worker thread was just waking
5899     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5900     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5901     // the global counter might not get updated.
5902     //
5903     // Currently, this can only happen as the library is unloaded,
5904     // so there are no harmful side effects.
5905     if (thread->th.th_active_in_pool) {
5906       thread->th.th_active_in_pool = FALSE;
5907       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5908       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5909     }
5910   }
5911 
5912   __kmp_free_implicit_task(thread);
5913 
5914 // Free the fast memory for tasking
5915 #if USE_FAST_MEMORY
5916   __kmp_free_fast_memory(thread);
5917 #endif /* USE_FAST_MEMORY */
5918 
5919   __kmp_suspend_uninitialize_thread(thread);
5920 
5921   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5922   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5923 
5924   --__kmp_all_nth;
5925 // __kmp_nth was decremented when thread is added to the pool.
5926 
5927 #ifdef KMP_ADJUST_BLOCKTIME
5928   /* Adjust blocktime back to user setting or default if necessary */
5929   /* Middle initialization might never have occurred                */
5930   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5931     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5932     if (__kmp_nth <= __kmp_avail_proc) {
5933       __kmp_zero_bt = FALSE;
5934     }
5935   }
5936 #endif /* KMP_ADJUST_BLOCKTIME */
5937 
5938   /* free the memory being used */
5939   if (__kmp_env_consistency_check) {
5940     if (thread->th.th_cons) {
5941       __kmp_free_cons_stack(thread->th.th_cons);
5942       thread->th.th_cons = NULL;
5943     }
5944   }
5945 
5946   if (thread->th.th_pri_common != NULL) {
5947     __kmp_free(thread->th.th_pri_common);
5948     thread->th.th_pri_common = NULL;
5949   }
5950 
5951   if (thread->th.th_task_state_memo_stack != NULL) {
5952     __kmp_free(thread->th.th_task_state_memo_stack);
5953     thread->th.th_task_state_memo_stack = NULL;
5954   }
5955 
5956 #if KMP_USE_BGET
5957   if (thread->th.th_local.bget_data != NULL) {
5958     __kmp_finalize_bget(thread);
5959   }
5960 #endif
5961 
5962 #if KMP_AFFINITY_SUPPORTED
5963   if (thread->th.th_affin_mask != NULL) {
5964     KMP_CPU_FREE(thread->th.th_affin_mask);
5965     thread->th.th_affin_mask = NULL;
5966   }
5967 #endif /* KMP_AFFINITY_SUPPORTED */
5968 
5969 #if KMP_USE_HIER_SCHED
5970   if (thread->th.th_hier_bar_data != NULL) {
5971     __kmp_free(thread->th.th_hier_bar_data);
5972     thread->th.th_hier_bar_data = NULL;
5973   }
5974 #endif
5975 
5976   __kmp_reap_team(thread->th.th_serial_team);
5977   thread->th.th_serial_team = NULL;
5978   __kmp_free(thread);
5979 
5980   KMP_MB();
5981 
5982 } // __kmp_reap_thread
5983 
5984 static void __kmp_internal_end(void) {
5985   int i;
5986 
5987   /* First, unregister the library */
5988   __kmp_unregister_library();
5989 
5990 #if KMP_OS_WINDOWS
5991   /* In Win static library, we can't tell when a root actually dies, so we
5992      reclaim the data structures for any root threads that have died but not
5993      unregistered themselves, in order to shut down cleanly.
5994      In Win dynamic library we also can't tell when a thread dies.  */
5995   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5996 // dead roots
5997 #endif
5998 
5999   for (i = 0; i < __kmp_threads_capacity; i++)
6000     if (__kmp_root[i])
6001       if (__kmp_root[i]->r.r_active)
6002         break;
6003   KMP_MB(); /* Flush all pending memory write invalidates.  */
6004   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6005 
6006   if (i < __kmp_threads_capacity) {
6007 #if KMP_USE_MONITOR
6008     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6009     KMP_MB(); /* Flush all pending memory write invalidates.  */
6010 
6011     // Need to check that monitor was initialized before reaping it. If we are
6012     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6013     // __kmp_monitor will appear to contain valid data, but it is only valid in
6014     // the parent process, not the child.
6015     // New behavior (201008): instead of keying off of the flag
6016     // __kmp_init_parallel, the monitor thread creation is keyed off
6017     // of the new flag __kmp_init_monitor.
6018     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6019     if (TCR_4(__kmp_init_monitor)) {
6020       __kmp_reap_monitor(&__kmp_monitor);
6021       TCW_4(__kmp_init_monitor, 0);
6022     }
6023     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6024     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6025 #endif // KMP_USE_MONITOR
6026   } else {
6027 /* TODO move this to cleanup code */
6028 #ifdef KMP_DEBUG
6029     /* make sure that everything has properly ended */
6030     for (i = 0; i < __kmp_threads_capacity; i++) {
6031       if (__kmp_root[i]) {
6032         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6033         //                    there can be uber threads alive here
6034         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6035       }
6036     }
6037 #endif
6038 
6039     KMP_MB();
6040 
6041     // Reap the worker threads.
6042     // This is valid for now, but be careful if threads are reaped sooner.
6043     while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6044       // Get the next thread from the pool.
6045       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6046       __kmp_thread_pool = thread->th.th_next_pool;
6047       // Reap it.
6048       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6049       thread->th.th_next_pool = NULL;
6050       thread->th.th_in_pool = FALSE;
6051       __kmp_reap_thread(thread, 0);
6052     }
6053     __kmp_thread_pool_insert_pt = NULL;
6054 
6055     // Reap teams.
6056     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6057       // Get the next team from the pool.
6058       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6059       __kmp_team_pool = team->t.t_next_pool;
6060       // Reap it.
6061       team->t.t_next_pool = NULL;
6062       __kmp_reap_team(team);
6063     }
6064 
6065     __kmp_reap_task_teams();
6066 
6067 #if KMP_OS_UNIX
6068     // Threads that are not reaped should not access any resources since they
6069     // are going to be deallocated soon, so the shutdown sequence should wait
6070     // until all threads either exit the final spin-waiting loop or begin
6071     // sleeping after the given blocktime.
6072     for (i = 0; i < __kmp_threads_capacity; i++) {
6073       kmp_info_t *thr = __kmp_threads[i];
6074       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6075         KMP_CPU_PAUSE();
6076     }
6077 #endif
6078 
6079     for (i = 0; i < __kmp_threads_capacity; ++i) {
6080       // TBD: Add some checking...
6081       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6082     }
6083 
6084     /* Make sure all threadprivate destructors get run by joining with all
6085        worker threads before resetting this flag */
6086     TCW_SYNC_4(__kmp_init_common, FALSE);
6087 
6088     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6089     KMP_MB();
6090 
6091 #if KMP_USE_MONITOR
6092     // See note above: One of the possible fixes for CQ138434 / CQ140126
6093     //
6094     // FIXME: push both code fragments down and CSE them?
6095     // push them into __kmp_cleanup() ?
6096     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6097     if (TCR_4(__kmp_init_monitor)) {
6098       __kmp_reap_monitor(&__kmp_monitor);
6099       TCW_4(__kmp_init_monitor, 0);
6100     }
6101     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6102     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6103 #endif
6104   } /* else !__kmp_global.t_active */
6105   TCW_4(__kmp_init_gtid, FALSE);
6106   KMP_MB(); /* Flush all pending memory write invalidates.  */
6107 
6108   __kmp_cleanup();
6109 #if OMPT_SUPPORT
6110   ompt_fini();
6111 #endif
6112 }
6113 
6114 void __kmp_internal_end_library(int gtid_req) {
6115   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6116   /* this shouldn't be a race condition because __kmp_internal_end() is the
6117      only place to clear __kmp_serial_init */
6118   /* we'll check this later too, after we get the lock */
6119   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6120   // redundant, because the next check will work in any case.
6121   if (__kmp_global.g.g_abort) {
6122     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6123     /* TODO abort? */
6124     return;
6125   }
6126   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6127     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6128     return;
6129   }
6130 
6131   KMP_MB(); /* Flush all pending memory write invalidates.  */
6132   /* find out who we are and what we should do */
6133   {
6134     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6135     KA_TRACE(
6136         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6137     if (gtid == KMP_GTID_SHUTDOWN) {
6138       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6139                     "already shutdown\n"));
6140       return;
6141     } else if (gtid == KMP_GTID_MONITOR) {
6142       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6143                     "registered, or system shutdown\n"));
6144       return;
6145     } else if (gtid == KMP_GTID_DNE) {
6146       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6147                     "shutdown\n"));
6148       /* we don't know who we are, but we may still shutdown the library */
6149     } else if (KMP_UBER_GTID(gtid)) {
6150       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6151       if (__kmp_root[gtid]->r.r_active) {
6152         __kmp_global.g.g_abort = -1;
6153         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6154         __kmp_unregister_library();
6155         KA_TRACE(10,
6156                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6157                   gtid));
6158         return;
6159       } else {
6160         KA_TRACE(
6161             10,
6162             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6163         __kmp_unregister_root_current_thread(gtid);
6164       }
6165     } else {
6166 /* worker threads may call this function through the atexit handler, if they
6167  * call exit() */
6168 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6169    TODO: do a thorough shutdown instead */
6170 #ifdef DUMP_DEBUG_ON_EXIT
6171       if (__kmp_debug_buf)
6172         __kmp_dump_debug_buffer();
6173 #endif
6174       // An unregister-library call was added here for the switch to shared
6175       // memory on Linux; without it, lots of files would be left in /dev/shm.
6176       // Clean up the shared memory file before exiting.
6177       __kmp_unregister_library();
6178       return;
6179     }
6180   }
6181   /* synchronize the termination process */
6182   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6183 
6184   /* have we already finished */
6185   if (__kmp_global.g.g_abort) {
6186     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6187     /* TODO abort? */
6188     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6189     return;
6190   }
6191   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6192     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6193     return;
6194   }
6195 
6196   /* We need this lock to enforce mutex between this reading of
6197      __kmp_threads_capacity and the writing by __kmp_register_root.
6198      Alternatively, we can use a counter of roots that is atomically updated by
6199      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6200      __kmp_internal_end_*.  */
6201   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6202 
6203   /* now we can safely conduct the actual termination */
6204   __kmp_internal_end();
6205 
6206   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6207   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6208 
6209   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6210 
6211 #ifdef DUMP_DEBUG_ON_EXIT
6212   if (__kmp_debug_buf)
6213     __kmp_dump_debug_buffer();
6214 #endif
6215 
6216 #if KMP_OS_WINDOWS
6217   __kmp_close_console();
6218 #endif
6219 
6220   __kmp_fini_allocator();
6221 
6222 } // __kmp_internal_end_library
6223 
6224 void __kmp_internal_end_thread(int gtid_req) {
6225   int i;
6226 
6227   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6228   /* this shouldn't be a race condition because __kmp_internal_end() is the
6229    * only place to clear __kmp_serial_init */
6230   /* we'll check this later too, after we get the lock */
6231   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6232   // redundant, because the next check will work in any case.
6233   if (__kmp_global.g.g_abort) {
6234     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6235     /* TODO abort? */
6236     return;
6237   }
6238   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6239     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6240     return;
6241   }
6242 
6243   KMP_MB(); /* Flush all pending memory write invalidates.  */
6244 
6245   /* find out who we are and what we should do */
6246   {
6247     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6248     KA_TRACE(10,
6249              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6250     if (gtid == KMP_GTID_SHUTDOWN) {
6251       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6252                     "already shutdown\n"));
6253       return;
6254     } else if (gtid == KMP_GTID_MONITOR) {
6255       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6256                     "registered, or system shutdown\n"));
6257       return;
6258     } else if (gtid == KMP_GTID_DNE) {
6259       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6260                     "shutdown\n"));
6261       return;
6262       /* we don't know who we are */
6263     } else if (KMP_UBER_GTID(gtid)) {
6264       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6265       if (__kmp_root[gtid]->r.r_active) {
6266         __kmp_global.g.g_abort = -1;
6267         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6268         KA_TRACE(10,
6269                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6270                   gtid));
6271         return;
6272       } else {
6273         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6274                       gtid));
6275         __kmp_unregister_root_current_thread(gtid);
6276       }
6277     } else {
6278       /* just a worker thread, let's leave */
6279       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6280 
6281       if (gtid >= 0) {
6282         __kmp_threads[gtid]->th.th_task_team = NULL;
6283       }
6284 
6285       KA_TRACE(10,
6286                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6287                 gtid));
6288       return;
6289     }
6290   }
6291 #if KMP_DYNAMIC_LIB
6292   if (__kmp_pause_status != kmp_hard_paused)
6293   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6294   // because it is better to shut down later, in the library destructor.
6295   {
6296     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6297     return;
6298   }
6299 #endif
6300   /* synchronize the termination process */
6301   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6302 
6303   /* have we already finished */
6304   if (__kmp_global.g.g_abort) {
6305     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6306     /* TODO abort? */
6307     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6308     return;
6309   }
6310   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6312     return;
6313   }
6314 
6315   /* We need this lock to enforce mutex between this reading of
6316      __kmp_threads_capacity and the writing by __kmp_register_root.
6317      Alternatively, we can use a counter of roots that is atomically updated by
6318      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6319      __kmp_internal_end_*.  */
6320 
6321   /* should we finish the run-time?  are all siblings done? */
6322   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6323 
6324   for (i = 0; i < __kmp_threads_capacity; ++i) {
6325     if (KMP_UBER_GTID(i)) {
6326       KA_TRACE(
6327           10,
6328           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6329       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6330       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6331       return;
6332     }
6333   }
6334 
6335   /* now we can safely conduct the actual termination */
6336 
6337   __kmp_internal_end();
6338 
6339   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6340   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6341 
6342   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6343 
6344 #ifdef DUMP_DEBUG_ON_EXIT
6345   if (__kmp_debug_buf)
6346     __kmp_dump_debug_buffer();
6347 #endif
6348 } // __kmp_internal_end_thread
6349 
6350 // -----------------------------------------------------------------------------
6351 // Library registration stuff.
6352 
6353 static long __kmp_registration_flag = 0;
6354 // Random value used to indicate library initialization.
6355 static char *__kmp_registration_str = NULL;
6356 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6357 
6358 static inline char *__kmp_reg_status_name() {
6359   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6360      each thread. If registration and unregistration go in different threads
6361      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6362      env var cannot be found, because the name will contain a different pid. */
6363 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6364   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6365                           (int)getuid());
6366 #else
6367   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6368 #endif
6369 } // __kmp_reg_status_get
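// Illustrative, standalone sketch (not part of the runtime; main(), the buffer
// sizes and the "libomp.so" file name are made up) of the registration value
// used by __kmp_register_library_startup() below: it has the form
// "<flag address>-<flag value in hex>-<library file>", produced with
// "%p-%lx-%s" and parsed back by splitting on '-' and re-reading the first two
// fields with %p / %lx.
#if 0
#include <cstdio>
#include <cstring>

int main() {
  static long flag = 0xCAFE1234L;
  char value[256];
  std::snprintf(value, sizeof(value), "%p-%lx-%s", (void *)&flag, flag,
                "libomp.so");

  // Split on the first two '-' separators.
  char buf[256];
  std::strncpy(buf, value, sizeof(buf) - 1);
  buf[sizeof(buf) - 1] = '\0';
  char *addr_str = buf;
  char *val_str = std::strchr(addr_str, '-');
  *val_str++ = '\0';
  char *file_name = std::strchr(val_str, '-');
  *file_name++ = '\0';

  void *flag_addr = nullptr;
  long flag_val = 0;
  std::sscanf(addr_str, "%p", &flag_addr);
  std::sscanf(val_str, "%lx", &flag_val);

  // A still-running neighbor would have *(long *)flag_addr == flag_val.
  std::printf("addr=%p val=%lx file=%s\n", flag_addr, flag_val, file_name);
  return 0;
}
#endif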
6370 
6371 void __kmp_register_library_startup(void) {
6372 
6373   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6374   int done = 0;
6375   union {
6376     double dtime;
6377     long ltime;
6378   } time;
6379 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6380   __kmp_initialize_system_tick();
6381 #endif
6382   __kmp_read_system_time(&time.dtime);
6383   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6384   __kmp_registration_str =
6385       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6386                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6387 
6388   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6389                 __kmp_registration_str));
6390 
6391   while (!done) {
6392 
6393     char *value = NULL; // Actual value of the environment variable.
6394 
6395 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6396     char *shm_name = __kmp_str_format("/%s", name);
6397     int shm_preexist = 0;
6398     char *data1;
6399     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6400     if ((fd1 == -1) && (errno == EEXIST)) {
6401       // file didn't open because it already exists.
6402       // try opening existing file
6403       fd1 = shm_open(shm_name, O_RDWR, 0666);
6404       if (fd1 == -1) { // file didn't open
6405         // error out here
6406         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6407                     __kmp_msg_null);
6408       } else {
6409         // able to open existing file
6410         shm_preexist = 1;
6411       }
6412     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6413       // "already exists".
6414       // error out here.
6415       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6416                   __kmp_msg_null);
6417     }
6418     if (shm_preexist == 0) {
6419       // we created the SHM; now set its size
6420       if (ftruncate(fd1, SHM_SIZE) == -1) {
6421         // error occurred setting the size
6422         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6423                     KMP_ERR(errno), __kmp_msg_null);
6424       }
6425     }
6426     data1 =
6427         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6428     if (data1 == MAP_FAILED) {
6429       // failed to map shared memory
6430       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6431                   __kmp_msg_null);
6432     }
6433     if (shm_preexist == 0) { // set data to SHM, set value
6434       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6435     }
6436     // Read value from either what we just wrote or existing file.
6437     value = __kmp_str_format("%s", data1); // read value from SHM
6438     munmap(data1, SHM_SIZE);
6439     close(fd1);
6440 #else // Windows and unix with static library
6441     // Set the environment variable, but do not overwrite it if it already exists.
6442     __kmp_env_set(name, __kmp_registration_str, 0);
6443     // read value to see if it got set
6444     value = __kmp_env_get(name);
6445 #endif
6446 
6447     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6448       done = 1; // Ok, environment variable set successfully, exit the loop.
6449     } else {
6450       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6451       // Check whether it is alive or dead.
6452       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6453       char *tail = value;
6454       char *flag_addr_str = NULL;
6455       char *flag_val_str = NULL;
6456       char const *file_name = NULL;
6457       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6458       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6459       file_name = tail;
6460       if (tail != NULL) {
6461         long *flag_addr = 0;
6462         long flag_val = 0;
6463         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6464         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6465         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6466           // First, check whether environment-encoded address is mapped into
6467           // addr space.
6468           // If so, dereference it to see if it still has the right value.
6469           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6470             neighbor = 1;
6471           } else {
6472             // If not, then we know the other copy of the library is no longer
6473             // running.
6474             neighbor = 2;
6475           }
6476         }
6477       }
6478       switch (neighbor) {
6479       case 0: // Cannot parse environment variable -- neighbor status unknown.
6480         // Assume it is the incompatible format of a future version of the
6481         // library. Assume the other library is alive.
6482         // WARN( ... ); // TODO: Issue a warning.
6483         file_name = "unknown library";
6484         KMP_FALLTHROUGH();
6485       // Attention! Falling to the next case. That's intentional.
6486       case 1: { // Neighbor is alive.
6487         // Check it is allowed.
6488         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6489         if (!__kmp_str_match_true(duplicate_ok)) {
6490           // That's not allowed. Issue fatal error.
6491           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6492                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6493         }
6494         KMP_INTERNAL_FREE(duplicate_ok);
6495         __kmp_duplicate_library_ok = 1;
6496         done = 1; // Exit the loop.
6497       } break;
6498       case 2: { // Neighbor is dead.
6499 
6500 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6501         // close shared memory.
6502         shm_unlink(shm_name); // this removes file in /dev/shm
6503 #else
6504         // Clear the variable and try to register library again.
6505         __kmp_env_unset(name);
6506 #endif
6507       } break;
6508       default: { KMP_DEBUG_ASSERT(0); } break;
6509       }
6510     }
6511     KMP_INTERNAL_FREE((void *)value);
6512 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6513     KMP_INTERNAL_FREE((void *)shm_name);
6514 #endif
6515   } // while
6516   KMP_INTERNAL_FREE((void *)name);
6517 
6518 } // func __kmp_register_library_startup
6519 
6520 void __kmp_unregister_library(void) {
6521 
6522   char *name = __kmp_reg_status_name();
6523   char *value = NULL;
6524 
6525 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6526   char *shm_name = __kmp_str_format("/%s", name);
6527   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6528   if (fd1 == -1) {
6529     // file did not open. return.
6530     return;
6531   }
6532   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6533   if (data1 != MAP_FAILED) {
6534     value = __kmp_str_format("%s", data1); // read value from SHM
6535     munmap(data1, SHM_SIZE);
6536   }
6537   close(fd1);
6538 #else
6539   value = __kmp_env_get(name);
6540 #endif
6541 
6542   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6543   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6544   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6545 //  Ok, this is our variable. Delete it.
6546 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6547     shm_unlink(shm_name); // this removes file in /dev/shm
6548 #else
6549     __kmp_env_unset(name);
6550 #endif
6551   }
6552 
6553 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6554   KMP_INTERNAL_FREE(shm_name);
6555 #endif
6556 
6557   KMP_INTERNAL_FREE(__kmp_registration_str);
6558   KMP_INTERNAL_FREE(value);
6559   KMP_INTERNAL_FREE(name);
6560 
6561   __kmp_registration_flag = 0;
6562   __kmp_registration_str = NULL;
6563 
6564 } // __kmp_unregister_library
6565 
6566 // End of Library registration stuff.
6567 // -----------------------------------------------------------------------------
6568 
6569 #if KMP_MIC_SUPPORTED
6570 
6571 static void __kmp_check_mic_type() {
6572   kmp_cpuid_t cpuid_state = {0};
6573   kmp_cpuid_t *cs_p = &cpuid_state;
6574   __kmp_x86_cpuid(1, 0, cs_p);
6575   // We don't support mic1 at the moment
6576   if ((cs_p->eax & 0xff0) == 0xB10) {
6577     __kmp_mic_type = mic2;
6578   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6579     __kmp_mic_type = mic3;
6580   } else {
6581     __kmp_mic_type = non_mic;
6582   }
6583 }
6584 
6585 #endif /* KMP_MIC_SUPPORTED */
6586 
6587 #if KMP_HAVE_UMWAIT
6588 static void __kmp_user_level_mwait_init() {
6589   struct kmp_cpuid buf;
6590   __kmp_x86_cpuid(7, 0, &buf);
6591   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6592   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6593                 __kmp_umwait_enabled));
6594 }
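// Illustrative, standalone sketch (assuming a GCC/Clang build where <cpuid.h>
// provides __get_cpuid_count; not a runtime helper) of the same feature test:
// CPUID leaf 7, sub-leaf 0 reports WAITPKG (umwait/tpause) in ECX bit 5.
#if 0
#include <cpuid.h>
#include <cstdio>

static int cpu_has_waitpkg(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0; // CPUID leaf 7 not supported
  return (ecx >> 5) & 1;
}

int main() {
  std::printf("waitpkg supported: %d\n", cpu_has_waitpkg());
  return 0;
}
#endif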
6595 #elif KMP_HAVE_MWAIT
6596 #ifndef AT_INTELPHIUSERMWAIT
6597 // Spurious, non-existent value that should always fail to return anything.
6598 // Will be replaced with the correct value once it is known.
6599 #define AT_INTELPHIUSERMWAIT 10000
6600 #endif
6601 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6602 // earlier OS is used to build the RTL, we'll use the following internal
6603 // function when the entry is not found.
6604 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6605 unsigned long getauxval(unsigned long) { return 0; }
6606 
6607 static void __kmp_user_level_mwait_init() {
6608   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6609   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6610   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6611   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6612   if (__kmp_mic_type == mic3) {
6613     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6614     if ((res & 0x1) || __kmp_user_level_mwait) {
6615       __kmp_mwait_enabled = TRUE;
6616       if (__kmp_user_level_mwait) {
6617         KMP_INFORM(EnvMwaitWarn);
6618       }
6619     } else {
6620       __kmp_mwait_enabled = FALSE;
6621     }
6622   }
6623   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6624                 "__kmp_mwait_enabled = %d\n",
6625                 __kmp_mic_type, __kmp_mwait_enabled));
6626 }
6627 #endif /* KMP_HAVE_UMWAIT */
6628 
6629 static void __kmp_do_serial_initialize(void) {
6630   int i, gtid;
6631   int size;
6632 
6633   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6634 
6635   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6636   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6637   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6638   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6639   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6640 
6641 #if OMPT_SUPPORT
6642   ompt_pre_init();
6643 #endif
6644 
6645   __kmp_validate_locks();
6646 
6647   /* Initialize internal memory allocator */
6648   __kmp_init_allocator();
6649 
6650   /* Register the library startup via an environment variable and check to see
6651      whether another copy of the library is already registered. */
6652 
6653   __kmp_register_library_startup();
6654 
6655   /* TODO reinitialization of library */
6656   if (TCR_4(__kmp_global.g.g_done)) {
6657     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6658   }
6659 
6660   __kmp_global.g.g_abort = 0;
6661   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6662 
6663 /* initialize the locks */
6664 #if KMP_USE_ADAPTIVE_LOCKS
6665 #if KMP_DEBUG_ADAPTIVE_LOCKS
6666   __kmp_init_speculative_stats();
6667 #endif
6668 #endif
6669 #if KMP_STATS_ENABLED
6670   __kmp_stats_init();
6671 #endif
6672   __kmp_init_lock(&__kmp_global_lock);
6673   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6674   __kmp_init_lock(&__kmp_debug_lock);
6675   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6676   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6677   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6678   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6679   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6680   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6681   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6682   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6683   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6684   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6685   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6686   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6687   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6688   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6689   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6690 #if KMP_USE_MONITOR
6691   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6692 #endif
6693   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6694 
6695   /* conduct initialization and initial setup of configuration */
6696 
6697   __kmp_runtime_initialize();
6698 
6699 #if KMP_MIC_SUPPORTED
6700   __kmp_check_mic_type();
6701 #endif
6702 
6703 // Some global variable initialization moved here from kmp_env_initialize()
6704 #ifdef KMP_DEBUG
6705   kmp_diag = 0;
6706 #endif
6707   __kmp_abort_delay = 0;
6708 
6709   // From __kmp_init_dflt_team_nth()
6710   /* assume the entire machine will be used */
6711   __kmp_dflt_team_nth_ub = __kmp_xproc;
6712   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6713     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6714   }
6715   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6716     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6717   }
6718   __kmp_max_nth = __kmp_sys_max_nth;
6719   __kmp_cg_max_nth = __kmp_sys_max_nth;
6720   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6721   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6722     __kmp_teams_max_nth = __kmp_sys_max_nth;
6723   }
6724 
6725   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6726   // part
6727   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6728 #if KMP_USE_MONITOR
6729   __kmp_monitor_wakeups =
6730       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6731   __kmp_bt_intervals =
6732       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6733 #endif
6734   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6735   __kmp_library = library_throughput;
6736   // From KMP_SCHEDULE initialization
6737   __kmp_static = kmp_sch_static_balanced;
6738 // AC: do not use analytical here, because it is non-monotonic
6739 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6740 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6741 // need to repeat assignment
6742 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6743 // bit control and barrier method control parts
6744 #if KMP_FAST_REDUCTION_BARRIER
6745 #define kmp_reduction_barrier_gather_bb ((int)1)
6746 #define kmp_reduction_barrier_release_bb ((int)1)
6747 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6748 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6749 #endif // KMP_FAST_REDUCTION_BARRIER
6750   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6751     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6752     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6753     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6754     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6755 #if KMP_FAST_REDUCTION_BARRIER
6756     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
6757       // ( lin_64 ): hyper,1
6758       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6759       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6760       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6761       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6762     }
6763 #endif // KMP_FAST_REDUCTION_BARRIER
6764   }
6765 #if KMP_FAST_REDUCTION_BARRIER
6766 #undef kmp_reduction_barrier_release_pat
6767 #undef kmp_reduction_barrier_gather_pat
6768 #undef kmp_reduction_barrier_release_bb
6769 #undef kmp_reduction_barrier_gather_bb
6770 #endif // KMP_FAST_REDUCTION_BARRIER
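// Note on the barrier defaults set above (an informal sketch, not normative):
// the branch bits are treated as a log2 fan-out, so a gather value of 2 means
// each parent in a tree/hyper barrier waits on up to 2^2 = 4 children per
// level, while the reduction barrier is pinned above to the hyper pattern with
// branch bits 1 (2-way). The KMP_*_BARRIER / KMP_*_BARRIER_PATTERN settings
// parsed later in __kmp_env_initialize() can presumably still override these
// per-barrier defaults.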
6771 #if KMP_MIC_SUPPORTED
6772   if (__kmp_mic_type == mic2) { // KNC
6773     // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6774     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6775     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6776         1; // forkjoin release
6777     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6778     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6779   }
6780 #if KMP_FAST_REDUCTION_BARRIER
6781   if (__kmp_mic_type == mic2) { // KNC
6782     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6783     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6784   }
6785 #endif // KMP_FAST_REDUCTION_BARRIER
6786 #endif // KMP_MIC_SUPPORTED
6787 
6788 // From KMP_CHECKS initialization
6789 #ifdef KMP_DEBUG
6790   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6791 #else
6792   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6793 #endif
6794 
6795   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6796   __kmp_foreign_tp = TRUE;
6797 
6798   __kmp_global.g.g_dynamic = FALSE;
6799   __kmp_global.g.g_dynamic_mode = dynamic_default;
6800 
6801   __kmp_env_initialize(NULL);
6802 
6803 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6804   __kmp_user_level_mwait_init();
6805 #endif
6806 // Print all messages in message catalog for testing purposes.
6807 #ifdef KMP_DEBUG
6808   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6809   if (__kmp_str_match_true(val)) {
6810     kmp_str_buf_t buffer;
6811     __kmp_str_buf_init(&buffer);
6812     __kmp_i18n_dump_catalog(&buffer);
6813     __kmp_printf("%s", buffer.str);
6814     __kmp_str_buf_free(&buffer);
6815   }
6816   __kmp_env_free(&val);
6817 #endif
6818 
6819   __kmp_threads_capacity =
6820       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6821   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6822   __kmp_tp_capacity = __kmp_default_tp_capacity(
6823       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6824 
6825   // If the library is shut down properly, both pools must be NULL. Just in
6826   // case, set them to NULL -- some memory may leak, but subsequent code will
6827   // work even if pools are not freed.
6828   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6829   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6830   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6831   __kmp_thread_pool = NULL;
6832   __kmp_thread_pool_insert_pt = NULL;
6833   __kmp_team_pool = NULL;
6834 
6835   /* Allocate all of the variable sized records */
6836   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6837    * expandable */
6838   /* Since allocation is cache-aligned, just add extra padding at the end */
6839   size =
6840       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6841       CACHE_LINE;
6842   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6843   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6844                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
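  // Illustrative layout of the single allocation above (informal, not
  // normative):
  //
  //   __kmp_threads                               __kmp_root
  //   |                                           |
  //   v                                           v
  //   [ kmp_info_t * x __kmp_threads_capacity | kmp_root_t * x capacity | pad ]
  //
  // Because both arrays live in one block, __kmp_cleanup() only frees
  // __kmp_threads and never frees __kmp_root separately.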
6845 
6846   /* init thread counts */
6847   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6848                    0); // Asserts fail if the library is reinitializing and
6849   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6850   __kmp_all_nth = 0;
6851   __kmp_nth = 0;
6852 
6853   /* setup the uber master thread and hierarchy */
6854   gtid = __kmp_register_root(TRUE);
6855   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6856   KMP_ASSERT(KMP_UBER_GTID(gtid));
6857   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6858 
6859   KMP_MB(); /* Flush all pending memory write invalidates.  */
6860 
6861   __kmp_common_initialize();
6862 
6863 #if KMP_OS_UNIX
6864   /* invoke the child fork handler */
6865   __kmp_register_atfork();
6866 #endif
6867 
6868 #if !KMP_DYNAMIC_LIB
6869   {
6870     /* Invoke the exit handler when the program finishes, only for static
6871        library. For dynamic library, we already have _fini and DllMain. */
6872     int rc = atexit(__kmp_internal_end_atexit);
6873     if (rc != 0) {
6874       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6875                   __kmp_msg_null);
6876     }
6877   }
6878 #endif
6879 
6880 #if KMP_HANDLE_SIGNALS
6881 #if KMP_OS_UNIX
6882   /* NOTE: make sure that this is called before the user installs their own
6883      signal handlers so that the user handlers are called first. this way they
6884      can return false, not call our handler, avoid terminating the library, and
6885      continue execution where they left off. */
6886   __kmp_install_signals(FALSE);
6887 #endif /* KMP_OS_UNIX */
6888 #if KMP_OS_WINDOWS
6889   __kmp_install_signals(TRUE);
6890 #endif /* KMP_OS_WINDOWS */
6891 #endif
6892 
6893   /* we have finished the serial initialization */
6894   __kmp_init_counter++;
6895 
6896   __kmp_init_serial = TRUE;
6897 
6898   if (__kmp_settings) {
6899     __kmp_env_print();
6900   }
6901 
6902   if (__kmp_display_env || __kmp_display_env_verbose) {
6903     __kmp_env_print_2();
6904   }
6905 
6906 #if OMPT_SUPPORT
6907   ompt_post_init();
6908 #endif
6909 
6910   KMP_MB();
6911 
6912   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6913 }
6914 
6915 void __kmp_serial_initialize(void) {
6916   if (__kmp_init_serial) {
6917     return;
6918   }
6919   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6920   if (__kmp_init_serial) {
6921     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6922     return;
6923   }
6924   __kmp_do_serial_initialize();
6925   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6926 }
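// The wrapper above (and __kmp_middle_initialize / __kmp_parallel_initialize
// below) follows the same double-checked pattern: an unlocked fast-path test,
// then the bootstrap lock, then a re-test before doing the real work. A
// minimal sketch of the shape, with hypothetical names:
//
//   if (initialized) return;            // fast path, no lock taken
//   acquire(&__kmp_initz_lock);
//   if (!initialized)                   // re-check under the lock
//     do_initialize();                  // sets the flag when done
//   release(&__kmp_initz_lock);
//
// This keeps repeated entry cheap while serializing the first caller.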
6927 
6928 static void __kmp_do_middle_initialize(void) {
6929   int i, j;
6930   int prev_dflt_team_nth;
6931 
6932   if (!__kmp_init_serial) {
6933     __kmp_do_serial_initialize();
6934   }
6935 
6936   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6937 
6938   // Save the previous value for the __kmp_dflt_team_nth so that
6939   // we can avoid some reinitialization if it hasn't changed.
6940   prev_dflt_team_nth = __kmp_dflt_team_nth;
6941 
6942 #if KMP_AFFINITY_SUPPORTED
6943   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6944   // number of cores on the machine.
6945   __kmp_affinity_initialize();
6946 
6947   // Run through the __kmp_threads array and set the affinity mask
6948   // for each root thread that is currently registered with the RTL.
6949   for (i = 0; i < __kmp_threads_capacity; i++) {
6950     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6951       __kmp_affinity_set_init_mask(i, TRUE);
6952     }
6953   }
6954 #endif /* KMP_AFFINITY_SUPPORTED */
6955 
6956   KMP_ASSERT(__kmp_xproc > 0);
6957   if (__kmp_avail_proc == 0) {
6958     __kmp_avail_proc = __kmp_xproc;
6959   }
6960 
6961   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6962   // correct them now
6963   j = 0;
6964   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6965     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6966         __kmp_avail_proc;
6967     j++;
6968   }
6969 
6970   if (__kmp_dflt_team_nth == 0) {
6971 #ifdef KMP_DFLT_NTH_CORES
6972     // Default #threads = #cores
6973     __kmp_dflt_team_nth = __kmp_ncores;
6974     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6975                   "__kmp_ncores (%d)\n",
6976                   __kmp_dflt_team_nth));
6977 #else
6978     // Default #threads = #available OS procs
6979     __kmp_dflt_team_nth = __kmp_avail_proc;
6980     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6981                   "__kmp_avail_proc(%d)\n",
6982                   __kmp_dflt_team_nth));
6983 #endif /* KMP_DFLT_NTH_CORES */
6984   }
6985 
6986   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6987     __kmp_dflt_team_nth = KMP_MIN_NTH;
6988   }
6989   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6990     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6991   }
6992 
6993   // There's no harm in continuing if the following check fails,
6994   // but it indicates an error in the previous logic.
6995   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6996 
6997   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6998     // Run through the __kmp_threads array and set the num threads icv for each
6999     // root thread that is currently registered with the RTL (which has not
7000     // already explicitly set its nthreads-var with a call to
7001     // omp_set_num_threads()).
7002     for (i = 0; i < __kmp_threads_capacity; i++) {
7003       kmp_info_t *thread = __kmp_threads[i];
7004       if (thread == NULL)
7005         continue;
7006       if (thread->th.th_current_task->td_icvs.nproc != 0)
7007         continue;
7008 
7009       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7010     }
7011   }
7012   KA_TRACE(
7013       20,
7014       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7015        __kmp_dflt_team_nth));
7016 
7017 #ifdef KMP_ADJUST_BLOCKTIME
7018   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7019   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7020     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7021     if (__kmp_nth > __kmp_avail_proc) {
7022       __kmp_zero_bt = TRUE;
7023     }
7024   }
7025 #endif /* KMP_ADJUST_BLOCKTIME */
7026 
7027   /* we have finished middle initialization */
7028   TCW_SYNC_4(__kmp_init_middle, TRUE);
7029 
7030   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7031 }
7032 
7033 void __kmp_middle_initialize(void) {
7034   if (__kmp_init_middle) {
7035     return;
7036   }
7037   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7038   if (__kmp_init_middle) {
7039     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7040     return;
7041   }
7042   __kmp_do_middle_initialize();
7043   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7044 }
7045 
7046 void __kmp_parallel_initialize(void) {
7047   int gtid = __kmp_entry_gtid(); // this might be a new root
7048 
7049   /* synchronize parallel initialization (for sibling) */
7050   if (TCR_4(__kmp_init_parallel))
7051     return;
7052   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7053   if (TCR_4(__kmp_init_parallel)) {
7054     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7055     return;
7056   }
7057 
7058   /* TODO reinitialization after we have already shut down */
7059   if (TCR_4(__kmp_global.g.g_done)) {
7060     KA_TRACE(
7061         10,
7062         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7063     __kmp_infinite_loop();
7064   }
7065 
7066   /* jc: The lock __kmp_initz_lock is already held, so calling
7067      __kmp_serial_initialize would cause a deadlock.  So we call
7068      __kmp_do_serial_initialize directly. */
7069   if (!__kmp_init_middle) {
7070     __kmp_do_middle_initialize();
7071   }
7072   __kmp_resume_if_hard_paused();
7073 
7074   /* begin initialization */
7075   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7076   KMP_ASSERT(KMP_UBER_GTID(gtid));
7077 
7078 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7079   // Save the FP control regs.
7080   // Worker threads will set theirs to these values at thread startup.
7081   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7082   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7083   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7085 
7086 #if KMP_OS_UNIX
7087 #if KMP_HANDLE_SIGNALS
7088   /*  must be after __kmp_serial_initialize  */
7089   __kmp_install_signals(TRUE);
7090 #endif
7091 #endif
7092 
7093   __kmp_suspend_initialize();
7094 
7095 #if defined(USE_LOAD_BALANCE)
7096   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7097     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7098   }
7099 #else
7100   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7101     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7102   }
7103 #endif
7104 
7105   if (__kmp_version) {
7106     __kmp_print_version_2();
7107   }
7108 
7109   /* we have finished parallel initialization */
7110   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7111 
7112   KMP_MB();
7113   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7114 
7115   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7116 }
7117 
7118 /* ------------------------------------------------------------------------ */
7119 
7120 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7121                                    kmp_team_t *team) {
7122   kmp_disp_t *dispatch;
7123 
7124   KMP_MB();
7125 
7126   /* none of the threads have encountered any constructs, yet. */
7127   this_thr->th.th_local.this_construct = 0;
7128 #if KMP_CACHE_MANAGE
7129   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7130 #endif /* KMP_CACHE_MANAGE */
7131   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7132   KMP_DEBUG_ASSERT(dispatch);
7133   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7134   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7135   // this_thr->th.th_info.ds.ds_tid ] );
7136 
7137   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7138   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7139   if (__kmp_env_consistency_check)
7140     __kmp_push_parallel(gtid, team->t.t_ident);
7141 
7142   KMP_MB(); /* Flush all pending memory write invalidates.  */
7143 }
7144 
7145 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7146                                   kmp_team_t *team) {
7147   if (__kmp_env_consistency_check)
7148     __kmp_pop_parallel(gtid, team->t.t_ident);
7149 
7150   __kmp_finish_implicit_task(this_thr);
7151 }
7152 
7153 int __kmp_invoke_task_func(int gtid) {
7154   int rc;
7155   int tid = __kmp_tid_from_gtid(gtid);
7156   kmp_info_t *this_thr = __kmp_threads[gtid];
7157   kmp_team_t *team = this_thr->th.th_team;
7158 
7159   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7160 #if USE_ITT_BUILD
7161   if (__itt_stack_caller_create_ptr) {
7162     __kmp_itt_stack_callee_enter(
7163         (__itt_caller)
7164             team->t.t_stack_id); // inform ittnotify about entering user's code
7165   }
7166 #endif /* USE_ITT_BUILD */
7167 #if INCLUDE_SSC_MARKS
7168   SSC_MARK_INVOKING();
7169 #endif
7170 
7171 #if OMPT_SUPPORT
7172   void *dummy;
7173   void **exit_frame_p;
7174   ompt_data_t *my_task_data;
7175   ompt_data_t *my_parallel_data;
7176   int ompt_team_size;
7177 
7178   if (ompt_enabled.enabled) {
7179     exit_frame_p = &(
7180         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7181   } else {
7182     exit_frame_p = &dummy;
7183   }
7184 
7185   my_task_data =
7186       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7187   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7188   if (ompt_enabled.ompt_callback_implicit_task) {
7189     ompt_team_size = team->t.t_nproc;
7190     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7191         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7192         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7193     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7194   }
7195 #endif
7196 
7197 #if KMP_STATS_ENABLED
7198   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7199   if (previous_state == stats_state_e::TEAMS_REGION) {
7200     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7201   } else {
7202     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7203   }
7204   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7205 #endif
7206 
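  // The pkfn invoked below is the compiler-outlined body of the parallel
  // region. As a rough sketch (names hypothetical), the outlined routine has
  // roughly the kmpc_micro shape
  //
  //   void outlined_body(kmp_int32 *gtid, kmp_int32 *tid, <shared args>...);
  //
  // and __kmp_invoke_microtask forwards t_argc/t_argv to it, plus the OMPT
  // exit-frame pointer when OMPT is enabled.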
7207   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7208                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7209 #if OMPT_SUPPORT
7210                               ,
7211                               exit_frame_p
7212 #endif
7213                               );
7214 #if OMPT_SUPPORT
7215   *exit_frame_p = NULL;
7216   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7217 #endif
7218 
7219 #if KMP_STATS_ENABLED
7220   if (previous_state == stats_state_e::TEAMS_REGION) {
7221     KMP_SET_THREAD_STATE(previous_state);
7222   }
7223   KMP_POP_PARTITIONED_TIMER();
7224 #endif
7225 
7226 #if USE_ITT_BUILD
7227   if (__itt_stack_caller_create_ptr) {
7228     __kmp_itt_stack_callee_leave(
7229         (__itt_caller)
7230             team->t.t_stack_id); // inform ittnotify about leaving user's code
7231   }
7232 #endif /* USE_ITT_BUILD */
7233   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7234 
7235   return rc;
7236 }
7237 
7238 void __kmp_teams_master(int gtid) {
7239   // This routine is called by all master threads in teams construct
7240   kmp_info_t *thr = __kmp_threads[gtid];
7241   kmp_team_t *team = thr->th.th_team;
7242   ident_t *loc = team->t.t_ident;
7243   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7244   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7245   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7246   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7247                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7248 
7249   // This thread is a new CG root.  Set up the proper variables.
7250   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7251   tmp->cg_root = thr; // Make thr the CG root
7252   // Init to thread limit that was stored when league masters were forked
7253   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7254   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7255   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7256                  " cg_nthreads to 1\n",
7257                  thr, tmp));
7258   tmp->up = thr->th.th_cg_roots;
7259   thr->th.th_cg_roots = tmp;
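  // After the push above, this thread's CG-root list looks roughly like
  // (informal sketch):
  //
  //   thr->th.th_cg_roots -> [teams CG root: limit = thread_limit ICV]
  //                            -> up -> [previous CG root] -> ...
  //
  // The previous head is expected to be restored when this CG root is later
  // popped at the end of the teams construct.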
7260 
7261 // Launch the league of teams now, but do not let workers execute
7262 // (they hang on fork barrier until next parallel)
7263 #if INCLUDE_SSC_MARKS
7264   SSC_MARK_FORKING();
7265 #endif
7266   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7267                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7268                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7269 #if INCLUDE_SSC_MARKS
7270   SSC_MARK_JOINING();
7271 #endif
7272   // If the team size was reduced from the limit, set it to the new size
7273   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7274     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7275   // AC: last parameter "1" eliminates join barrier which won't work because
7276   // worker threads are in a fork barrier waiting for more parallel regions
7277   __kmp_join_call(loc, gtid
7278 #if OMPT_SUPPORT
7279                   ,
7280                   fork_context_intel
7281 #endif
7282                   ,
7283                   1);
7284 }
7285 
7286 int __kmp_invoke_teams_master(int gtid) {
7287   kmp_info_t *this_thr = __kmp_threads[gtid];
7288   kmp_team_t *team = this_thr->th.th_team;
7289 #if KMP_DEBUG
7290   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7291     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7292                      (void *)__kmp_teams_master);
7293 #endif
7294   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7295 #if OMPT_SUPPORT
7296   int tid = __kmp_tid_from_gtid(gtid);
7297   ompt_data_t *task_data =
7298       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7299   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7300   if (ompt_enabled.ompt_callback_implicit_task) {
7301     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7302         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7303         ompt_task_initial);
7304     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7305   }
7306 #endif
7307   __kmp_teams_master(gtid);
7308 #if OMPT_SUPPORT
7309   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7310 #endif
7311   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7312   return 1;
7313 }
7314 
7315 /* This sets the requested number of threads for the next parallel region
7316    encountered by this team. Since this should be enclosed in the forkjoin
7317    critical section, it should avoid race conditions with asymmetrical nested
7318    parallelism. */
7319 
7320 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7321   kmp_info_t *thr = __kmp_threads[gtid];
7322 
7323   if (num_threads > 0)
7324     thr->th.th_set_nproc = num_threads;
7325 }
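// Usage sketch (hedged): for `#pragma omp parallel num_threads(n)` the
// compiler typically emits a call to the __kmpc_push_num_threads entry point,
// which forwards to __kmp_push_num_threads above immediately before the
// __kmpc_fork_call that starts the region; th_set_nproc is then consumed
// (and reset) when the new team is sized in __kmp_fork_call.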
7326 
7327 /* this sets the requested number of teams for the teams region and/or
7328    the number of threads for the next parallel region encountered  */
7329 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7330                           int num_threads) {
7331   kmp_info_t *thr = __kmp_threads[gtid];
7332   KMP_DEBUG_ASSERT(num_teams >= 0);
7333   KMP_DEBUG_ASSERT(num_threads >= 0);
7334 
7335   if (num_teams == 0)
7336     num_teams = 1; // default number of teams is 1.
7337   if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7338     if (!__kmp_reserve_warn) {
7339       __kmp_reserve_warn = 1;
7340       __kmp_msg(kmp_ms_warning,
7341                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7342                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7343     }
7344     num_teams = __kmp_teams_max_nth;
7345   }
7346   // Set number of teams (number of threads in the outer "parallel" of the
7347   // teams)
7348   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7349 
7350   // Remember the number of threads for inner parallel regions
7351   if (!TCR_4(__kmp_init_middle))
7352     __kmp_middle_initialize(); // get internal globals calculated
7353   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7354   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7355   if (num_threads == 0) {
7356     num_threads = __kmp_avail_proc / num_teams;
7357     // adjust num_threads w/o warning as it is not a user setting
7358     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7359     // no thread_limit clause specified -  do not change thread-limit-var ICV
7360     if (num_threads > __kmp_dflt_team_nth) {
7361       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7362     }
7363     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7364       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7365     } // prevent team size from exceeding thread-limit-var
7366     if (num_teams * num_threads > __kmp_teams_max_nth) {
7367       num_threads = __kmp_teams_max_nth / num_teams;
7368     }
7369   } else {
7370     // This thread will be the master of the league masters
7371     // Store new thread limit; old limit is saved in th_cg_roots list
7372     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7373     // num_threads = min(num_threads, nthreads-var)
7374     if (num_threads > __kmp_dflt_team_nth) {
7375       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7376     }
7377     if (num_teams * num_threads > __kmp_teams_max_nth) {
7378       int new_threads = __kmp_teams_max_nth / num_teams;
7379       if (!__kmp_reserve_warn) { // user asked for too many threads
7380         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7381         __kmp_msg(kmp_ms_warning,
7382                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7383                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7384       }
7385       num_threads = new_threads;
7386     }
7387   }
7388   thr->th.th_teams_size.nth = num_threads;
7389 }
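// Worked example of the num_threads == 0 path above (illustrative numbers
// only): with __kmp_avail_proc = 64 and num_teams = 4, num_threads starts at
// 64 / 4 = 16; it is then capped by nthreads-var (__kmp_dflt_team_nth) and by
// this thread's thread-limit-var, and finally the product 4 * 16 = 64 is
// checked against __kmp_teams_max_nth and num_threads is reduced if the
// product would exceed it.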
7390 
7391 // Set the proc_bind var to use in the following parallel region.
7392 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7393   kmp_info_t *thr = __kmp_threads[gtid];
7394   thr->th.th_set_proc_bind = proc_bind;
7395 }
7396 
7397 /* Launch the worker threads into the microtask. */
7398 
7399 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7400   kmp_info_t *this_thr = __kmp_threads[gtid];
7401 
7402 #ifdef KMP_DEBUG
7403   int f;
7404 #endif /* KMP_DEBUG */
7405 
7406   KMP_DEBUG_ASSERT(team);
7407   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7408   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7409   KMP_MB(); /* Flush all pending memory write invalidates.  */
7410 
7411   team->t.t_construct = 0; /* no single directives seen yet */
7412   team->t.t_ordered.dt.t_value =
7413       0; /* thread 0 enters the ordered section first */
7414 
7415   /* Reset the identifiers on the dispatch buffer */
7416   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7417   if (team->t.t_max_nproc > 1) {
7418     int i;
7419     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7420       team->t.t_disp_buffer[i].buffer_index = i;
7421       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7422     }
7423   } else {
7424     team->t.t_disp_buffer[0].buffer_index = 0;
7425     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7426   }
7427 
7428   KMP_MB(); /* Flush all pending memory write invalidates.  */
7429   KMP_ASSERT(this_thr->th.th_team == team);
7430 
7431 #ifdef KMP_DEBUG
7432   for (f = 0; f < team->t.t_nproc; f++) {
7433     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7434                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7435   }
7436 #endif /* KMP_DEBUG */
7437 
7438   /* release the worker threads so they may begin working */
7439   __kmp_fork_barrier(gtid, 0);
7440 }
7441 
7442 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7443   kmp_info_t *this_thr = __kmp_threads[gtid];
7444 
7445   KMP_DEBUG_ASSERT(team);
7446   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7447   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7448   KMP_MB(); /* Flush all pending memory write invalidates.  */
7449 
7450 /* Join barrier after fork */
7451 
7452 #ifdef KMP_DEBUG
7453   if (__kmp_threads[gtid] &&
7454       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7455     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7456                  __kmp_threads[gtid]);
7457     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7458                  "team->t.t_nproc=%d\n",
7459                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7460                  team->t.t_nproc);
7461     __kmp_print_structure();
7462   }
7463   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7464                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7465 #endif /* KMP_DEBUG */
7466 
7467   __kmp_join_barrier(gtid); /* wait for everyone */
7468 #if OMPT_SUPPORT
7469   if (ompt_enabled.enabled &&
7470       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7471     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7472     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7473     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7474 #if OMPT_OPTIONAL
7475     void *codeptr = NULL;
7476     if (KMP_MASTER_TID(ds_tid) &&
7477         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7478          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7479       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7480 
7481     if (ompt_enabled.ompt_callback_sync_region_wait) {
7482       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7483           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7484           codeptr);
7485     }
7486     if (ompt_enabled.ompt_callback_sync_region) {
7487       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7488           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7489           codeptr);
7490     }
7491 #endif
7492     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7493       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7494           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7495     }
7496   }
7497 #endif
7498 
7499   KMP_MB(); /* Flush all pending memory write invalidates.  */
7500   KMP_ASSERT(this_thr->th.th_team == team);
7501 }
7502 
7503 /* ------------------------------------------------------------------------ */
7504 
7505 #ifdef USE_LOAD_BALANCE
7506 
7507 // Return the number of worker threads actively spinning in the hot team,
7508 // if we are at the outermost level of parallelism. Otherwise, return 0.
7509 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7510   int i;
7511   int retval;
7512   kmp_team_t *hot_team;
7513 
7514   if (root->r.r_active) {
7515     return 0;
7516   }
7517   hot_team = root->r.r_hot_team;
7518   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7519     return hot_team->t.t_nproc - 1; // Don't count master thread
7520   }
7521 
7522   // Skip the master thread - it is accounted for elsewhere.
7523   retval = 0;
7524   for (i = 1; i < hot_team->t.t_nproc; i++) {
7525     if (hot_team->t.t_threads[i]->th.th_active) {
7526       retval++;
7527     }
7528   }
7529   return retval;
7530 }
7531 
7532 // Perform an automatic adjustment to the number of
7533 // threads used by the next parallel region.
7534 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7535   int retval;
7536   int pool_active;
7537   int hot_team_active;
7538   int team_curr_active;
7539   int system_active;
7540 
7541   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7542                 set_nproc));
7543   KMP_DEBUG_ASSERT(root);
7544   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7545                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7546   KMP_DEBUG_ASSERT(set_nproc > 1);
7547 
7548   if (set_nproc == 1) {
7549     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7550     return 1;
7551   }
7552 
7553   // Threads that are active in the thread pool, active in the hot team for this
7554   // particular root (if we are at the outer par level), and the currently
7555   // executing thread (to become the master) are available to add to the new
7556   // team, but are currently contributing to the system load, and must be
7557   // accounted for.
7558   pool_active = __kmp_thread_pool_active_nth;
7559   hot_team_active = __kmp_active_hot_team_nproc(root);
7560   team_curr_active = pool_active + hot_team_active + 1;
7561 
7562   // Check the system load.
7563   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7564   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7565                 "hot team active = %d\n",
7566                 system_active, pool_active, hot_team_active));
7567 
7568   if (system_active < 0) {
7569     // There was an error reading the necessary info from /proc, so use the
7570     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7571     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7572     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7573     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7574 
7575     // Make this call behave like the thread limit algorithm.
7576     retval = __kmp_avail_proc - __kmp_nth +
7577              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7578     if (retval > set_nproc) {
7579       retval = set_nproc;
7580     }
7581     if (retval < KMP_MIN_NTH) {
7582       retval = KMP_MIN_NTH;
7583     }
7584 
7585     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7586                   retval));
7587     return retval;
7588   }
7589 
7590   // There is a slight delay in the load balance algorithm in detecting new
7591   // running procs. The real system load at this instant should be at least as
7592   // large as the #active omp threads that are available to add to the team.
7593   if (system_active < team_curr_active) {
7594     system_active = team_curr_active;
7595   }
7596   retval = __kmp_avail_proc - system_active + team_curr_active;
7597   if (retval > set_nproc) {
7598     retval = set_nproc;
7599   }
7600   if (retval < KMP_MIN_NTH) {
7601     retval = KMP_MIN_NTH;
7602   }
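  // Illustrative arithmetic (made-up numbers): with __kmp_avail_proc = 8,
  // pool_active = 2 and hot_team_active = 1, team_curr_active = 2 + 1 + 1 = 4.
  // If __kmp_get_load_balance() reports system_active = 6, then
  // retval = 8 - 6 + 4 = 6, i.e. six threads can run without oversubscribing,
  // subject to the set_nproc / KMP_MIN_NTH clamps above.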
7603 
7604   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7605   return retval;
7606 } // __kmp_load_balance_nproc()
7607 
7608 #endif /* USE_LOAD_BALANCE */
7609 
7610 /* ------------------------------------------------------------------------ */
7611 
7612 /* NOTE: this is called with the __kmp_init_lock held */
7613 void __kmp_cleanup(void) {
7614   int f;
7615 
7616   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7617 
7618   if (TCR_4(__kmp_init_parallel)) {
7619 #if KMP_HANDLE_SIGNALS
7620     __kmp_remove_signals();
7621 #endif
7622     TCW_4(__kmp_init_parallel, FALSE);
7623   }
7624 
7625   if (TCR_4(__kmp_init_middle)) {
7626 #if KMP_AFFINITY_SUPPORTED
7627     __kmp_affinity_uninitialize();
7628 #endif /* KMP_AFFINITY_SUPPORTED */
7629     __kmp_cleanup_hierarchy();
7630     TCW_4(__kmp_init_middle, FALSE);
7631   }
7632 
7633   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7634 
7635   if (__kmp_init_serial) {
7636     __kmp_runtime_destroy();
7637     __kmp_init_serial = FALSE;
7638   }
7639 
7640   __kmp_cleanup_threadprivate_caches();
7641 
7642   for (f = 0; f < __kmp_threads_capacity; f++) {
7643     if (__kmp_root[f] != NULL) {
7644       __kmp_free(__kmp_root[f]);
7645       __kmp_root[f] = NULL;
7646     }
7647   }
7648   __kmp_free(__kmp_threads);
7649   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7650   // there is no need in freeing __kmp_root.
7651   __kmp_threads = NULL;
7652   __kmp_root = NULL;
7653   __kmp_threads_capacity = 0;
7654 
7655 #if KMP_USE_DYNAMIC_LOCK
7656   __kmp_cleanup_indirect_user_locks();
7657 #else
7658   __kmp_cleanup_user_locks();
7659 #endif
7660 
7661 #if KMP_AFFINITY_SUPPORTED
7662   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7663   __kmp_cpuinfo_file = NULL;
7664 #endif /* KMP_AFFINITY_SUPPORTED */
7665 
7666 #if KMP_USE_ADAPTIVE_LOCKS
7667 #if KMP_DEBUG_ADAPTIVE_LOCKS
7668   __kmp_print_speculative_stats();
7669 #endif
7670 #endif
7671   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7672   __kmp_nested_nth.nth = NULL;
7673   __kmp_nested_nth.size = 0;
7674   __kmp_nested_nth.used = 0;
7675   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7676   __kmp_nested_proc_bind.bind_types = NULL;
7677   __kmp_nested_proc_bind.size = 0;
7678   __kmp_nested_proc_bind.used = 0;
7679   if (__kmp_affinity_format) {
7680     KMP_INTERNAL_FREE(__kmp_affinity_format);
7681     __kmp_affinity_format = NULL;
7682   }
7683 
7684   __kmp_i18n_catclose();
7685 
7686 #if KMP_USE_HIER_SCHED
7687   __kmp_hier_scheds.deallocate();
7688 #endif
7689 
7690 #if KMP_STATS_ENABLED
7691   __kmp_stats_fini();
7692 #endif
7693 
7694   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7695 }
7696 
7697 /* ------------------------------------------------------------------------ */
7698 
7699 int __kmp_ignore_mppbeg(void) {
7700   char *env;
7701 
7702   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7703     if (__kmp_str_match_false(env))
7704       return FALSE;
7705   }
7706   // By default __kmpc_begin() is no-op.
7707   return TRUE;
7708 }
7709 
7710 int __kmp_ignore_mppend(void) {
7711   char *env;
7712 
7713   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7714     if (__kmp_str_match_false(env))
7715       return FALSE;
7716   }
7717   // By default __kmpc_end() is no-op.
7718   return TRUE;
7719 }
7720 
7721 void __kmp_internal_begin(void) {
7722   int gtid;
7723   kmp_root_t *root;
7724 
7725   /* this is a very important step as it will register new sibling threads
7726      and assign these new uber threads a new gtid */
7727   gtid = __kmp_entry_gtid();
7728   root = __kmp_threads[gtid]->th.th_root;
7729   KMP_ASSERT(KMP_UBER_GTID(gtid));
7730 
7731   if (root->r.r_begin)
7732     return;
7733   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7734   if (root->r.r_begin) {
7735     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7736     return;
7737   }
7738 
7739   root->r.r_begin = TRUE;
7740 
7741   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7742 }
7743 
7744 /* ------------------------------------------------------------------------ */
7745 
7746 void __kmp_user_set_library(enum library_type arg) {
7747   int gtid;
7748   kmp_root_t *root;
7749   kmp_info_t *thread;
7750 
7751   /* first, make sure we are initialized so we can get our gtid */
7752 
7753   gtid = __kmp_entry_gtid();
7754   thread = __kmp_threads[gtid];
7755 
7756   root = thread->th.th_root;
7757 
7758   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7759                 library_serial));
7760   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7761                                   thread */
7762     KMP_WARNING(SetLibraryIncorrectCall);
7763     return;
7764   }
7765 
7766   switch (arg) {
7767   case library_serial:
7768     thread->th.th_set_nproc = 0;
7769     set__nproc(thread, 1);
7770     break;
7771   case library_turnaround:
7772     thread->th.th_set_nproc = 0;
7773     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7774                                            : __kmp_dflt_team_nth_ub);
7775     break;
7776   case library_throughput:
7777     thread->th.th_set_nproc = 0;
7778     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7779                                            : __kmp_dflt_team_nth_ub);
7780     break;
7781   default:
7782     KMP_FATAL(UnknownLibraryType, arg);
7783   }
7784 
7785   __kmp_aux_set_library(arg);
7786 }
7787 
7788 void __kmp_aux_set_stacksize(size_t arg) {
7789   if (!__kmp_init_serial)
7790     __kmp_serial_initialize();
7791 
7792 #if KMP_OS_DARWIN
7793   if (arg & (0x1000 - 1)) {
7794     arg &= ~(0x1000 - 1);
7795     if (arg + 0x1000) /* check for overflow if we round up */
7796       arg += 0x1000;
7797   }
7798 #endif
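  // Example of the Darwin rounding above (illustrative values): a request of
  // 0x1800 bytes has low bits set, so it is first truncated to 0x1000 and then
  // bumped to 0x2000, i.e. rounded up to the next 4 KiB page boundary.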
7799   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7800 
7801   /* only change the default stacksize before the first parallel region */
7802   if (!TCR_4(__kmp_init_parallel)) {
7803     size_t value = arg; /* argument is in bytes */
7804 
7805     if (value < __kmp_sys_min_stksize)
7806       value = __kmp_sys_min_stksize;
7807     else if (value > KMP_MAX_STKSIZE)
7808       value = KMP_MAX_STKSIZE;
7809 
7810     __kmp_stksize = value;
7811 
7812     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7813   }
7814 
7815   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7816 }
7817 
7818 /* set the behaviour of the runtime library */
7819 /* TODO this can cause some odd behaviour with sibling parallelism... */
7820 void __kmp_aux_set_library(enum library_type arg) {
7821   __kmp_library = arg;
7822 
7823   switch (__kmp_library) {
7824   case library_serial: {
7825     KMP_INFORM(LibraryIsSerial);
7826   } break;
7827   case library_turnaround:
7828     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7829       __kmp_use_yield = 2; // only yield when oversubscribed
7830     break;
7831   case library_throughput:
7832     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7833       __kmp_dflt_blocktime = 200;
7834     break;
7835   default:
7836     KMP_FATAL(UnknownLibraryType, arg);
7837   }
7838 }
7839 
7840 /* Getting team information common for all team API */
7841 // Returns NULL if not in teams construct
7842 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7843   kmp_info_t *thr = __kmp_entry_thread();
7844   teams_serialized = 0;
7845   if (thr->th.th_teams_microtask) {
7846     kmp_team_t *team = thr->th.th_team;
7847     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7848     int ii = team->t.t_level;
7849     teams_serialized = team->t.t_serialized;
7850     int level = tlevel + 1;
7851     KMP_DEBUG_ASSERT(ii >= tlevel);
7852     while (ii > level) {
7853       for (teams_serialized = team->t.t_serialized;
7854            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7855       }
7856       if (team->t.t_serialized && (!teams_serialized)) {
7857         team = team->t.t_parent;
7858         continue;
7859       }
7860       if (ii > level) {
7861         team = team->t.t_parent;
7862         ii--;
7863       }
7864     }
7865     return team;
7866   }
7867   return NULL;
7868 }
7869 
7870 int __kmp_aux_get_team_num() {
7871   int serialized;
7872   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7873   if (team) {
7874     if (serialized > 1) {
7875       return 0; // teams region is serialized ( 1 team of 1 thread ).
7876     } else {
7877       return team->t.t_master_tid;
7878     }
7879   }
7880   return 0;
7881 }
7882 
7883 int __kmp_aux_get_num_teams() {
7884   int serialized;
7885   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7886   if (team) {
7887     if (serialized > 1) {
7888       return 1;
7889     } else {
7890       return team->t.t_parent->t.t_nproc;
7891     }
7892   }
7893   return 1;
7894 }
7895 
7896 /* ------------------------------------------------------------------------ */
7897 
7898 /*
7899  * Affinity Format Parser
7900  *
7901  * Field is in form of: %[[[0].]size]type
7902  * % and type are required (%% means print a literal '%')
7903  * type is either single char or long name surrounded by {},
7904  * e.g., N or {num_threads}
7905  * 0 => leading zeros
7906  * . => right justified when size is specified
7907  * by default output is left justified
7908  * size is the *minimum* field length
7909  * All other characters are printed as is
7910  *
7911  * Available field types:
7912  * L {thread_level}      - omp_get_level()
7913  * n {thread_num}        - omp_get_thread_num()
7914  * h {host}              - name of host machine
7915  * P {process_id}        - process id (integer)
7916  * T {thread_identifier} - native thread identifier (integer)
7917  * N {num_threads}       - omp_get_num_threads()
7918  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7919  * a {thread_affinity}   - comma separated list of integers or integer ranges
7920  *                         (values of affinity mask)
7921  *
7922  * Implementation-specific field types can be added
7923  * If a type is unknown, print "undefined"
7924 */
7925 
7926 // Structure holding the short name, long name, and corresponding data type
7927 // for snprintf.  A table of these will represent the entire valid keyword
7928 // field types.
7929 typedef struct kmp_affinity_format_field_t {
7930   char short_name; // from spec e.g., L -> thread level
7931   const char *long_name; // from spec thread_level -> thread level
7932   char field_format; // data type for snprintf (typically 'd' or 's'
7933   // for integer or string)
7934 } kmp_affinity_format_field_t;
7935 
7936 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7937 #if KMP_AFFINITY_SUPPORTED
7938     {'A', "thread_affinity", 's'},
7939 #endif
7940     {'t', "team_num", 'd'},
7941     {'T', "num_teams", 'd'},
7942     {'L', "nesting_level", 'd'},
7943     {'n', "thread_num", 'd'},
7944     {'N', "num_threads", 'd'},
7945     {'a', "ancestor_tnum", 'd'},
7946     {'H', "host", 's'},
7947     {'P', "process_id", 'd'},
7948     {'i', "native_thread_id", 'd'}};
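// Example (hedged, all values made up): with the table above, a format string
// such as "host=%H pid=%P thread=%0.4n binds_to=%A" could expand to something
// like "host=node17 pid=12345 thread=0002 binds_to=0-3", where %A is only
// meaningful when affinity is supported and enabled.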
7949 
7950 // Return the number of characters it takes to hold the field
7951 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7952                                             const char **ptr,
7953                                             kmp_str_buf_t *field_buffer) {
7954   int rc, format_index, field_value;
7955   const char *width_left, *width_right;
7956   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7957   static const int FORMAT_SIZE = 20;
7958   char format[FORMAT_SIZE] = {0};
7959   char absolute_short_name = 0;
7960 
7961   KMP_DEBUG_ASSERT(gtid >= 0);
7962   KMP_DEBUG_ASSERT(th);
7963   KMP_DEBUG_ASSERT(**ptr == '%');
7964   KMP_DEBUG_ASSERT(field_buffer);
7965 
7966   __kmp_str_buf_clear(field_buffer);
7967 
7968   // Skip the initial %
7969   (*ptr)++;
7970 
7971   // Check for %% first
7972   if (**ptr == '%') {
7973     __kmp_str_buf_cat(field_buffer, "%", 1);
7974     (*ptr)++; // skip over the second %
7975     return 1;
7976   }
7977 
7978   // Parse field modifiers if they are present
7979   pad_zeros = false;
7980   if (**ptr == '0') {
7981     pad_zeros = true;
7982     (*ptr)++; // skip over 0
7983   }
7984   right_justify = false;
7985   if (**ptr == '.') {
7986     right_justify = true;
7987     (*ptr)++; // skip over .
7988   }
7989   // Parse width of field: [width_left, width_right)
7990   width_left = width_right = NULL;
7991   if (**ptr >= '0' && **ptr <= '9') {
7992     width_left = *ptr;
7993     SKIP_DIGITS(*ptr);
7994     width_right = *ptr;
7995   }
7996 
7997   // Create the format for KMP_SNPRINTF based on flags parsed above
7998   format_index = 0;
7999   format[format_index++] = '%';
8000   if (!right_justify)
8001     format[format_index++] = '-';
8002   if (pad_zeros)
8003     format[format_index++] = '0';
8004   if (width_left && width_right) {
8005     int i = 0;
8006     // Only allow 8 digit number widths.
8007     // This also prevents overflowing the format variable.
8008     while (i < 8 && width_left < width_right) {
8009       format[format_index++] = *width_left;
8010       width_left++;
8011       i++;
8012     }
8013   }
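  // e.g. (illustrative): parsing "%0.4n" yields pad_zeros = true,
  // right_justify = true and width "4", so the snprintf format built here
  // becomes "%04d" once the 'd' for thread_num is appended below; "%4n"
  // (no '.') would instead produce the left-justified "%-4d".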
8014 
8015   // Parse a name (long or short)
8016   // Canonicalize the name into absolute_short_name
8017   found_valid_name = false;
8018   parse_long_name = (**ptr == '{');
8019   if (parse_long_name)
8020     (*ptr)++; // skip initial left brace
8021   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8022                              sizeof(__kmp_affinity_format_table[0]);
8023        ++i) {
8024     char short_name = __kmp_affinity_format_table[i].short_name;
8025     const char *long_name = __kmp_affinity_format_table[i].long_name;
8026     char field_format = __kmp_affinity_format_table[i].field_format;
8027     if (parse_long_name) {
8028       int length = KMP_STRLEN(long_name);
8029       if (strncmp(*ptr, long_name, length) == 0) {
8030         found_valid_name = true;
8031         (*ptr) += length; // skip the long name
8032       }
8033     } else if (**ptr == short_name) {
8034       found_valid_name = true;
8035       (*ptr)++; // skip the short name
8036     }
8037     if (found_valid_name) {
8038       format[format_index++] = field_format;
8039       format[format_index++] = '\0';
8040       absolute_short_name = short_name;
8041       break;
8042     }
8043   }
8044   if (parse_long_name) {
8045     if (**ptr != '}') {
8046       absolute_short_name = 0;
8047     } else {
8048       (*ptr)++; // skip over the right brace
8049     }
8050   }
8051 
8052   // Attempt to fill the buffer with the requested
8053   // value using snprintf within __kmp_str_buf_print()
8054   switch (absolute_short_name) {
8055   case 't':
8056     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8057     break;
8058   case 'T':
8059     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8060     break;
8061   case 'L':
8062     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8063     break;
8064   case 'n':
8065     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8066     break;
8067   case 'H': {
8068     static const int BUFFER_SIZE = 256;
8069     char buf[BUFFER_SIZE];
8070     __kmp_expand_host_name(buf, BUFFER_SIZE);
8071     rc = __kmp_str_buf_print(field_buffer, format, buf);
8072   } break;
8073   case 'P':
8074     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8075     break;
8076   case 'i':
8077     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8078     break;
8079   case 'N':
8080     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8081     break;
8082   case 'a':
8083     field_value =
8084         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8085     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8086     break;
8087 #if KMP_AFFINITY_SUPPORTED
8088   case 'A': {
8089     kmp_str_buf_t buf;
8090     __kmp_str_buf_init(&buf);
8091     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8092     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8093     __kmp_str_buf_free(&buf);
8094   } break;
8095 #endif
8096   default:
8097     // According to the spec, if an implementation does not have info for a
8098     // field type, then "undefined" is printed
8099     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8100     // Skip the field
8101     if (parse_long_name) {
8102       SKIP_TOKEN(*ptr);
8103       if (**ptr == '}')
8104         (*ptr)++;
8105     } else {
8106       (*ptr)++;
8107     }
8108   }
8109 
8110   KMP_ASSERT(format_index <= FORMAT_SIZE);
8111   return rc;
8112 }
8113 
8114 /*
8115  * Return number of characters needed to hold the affinity string
8116  * (not including null byte character)
8117  * The resultant string is printed to buffer, which the caller can then
8118  * handle afterwards
8119 */
8120 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8121                                   kmp_str_buf_t *buffer) {
8122   const char *parse_ptr;
8123   size_t retval;
8124   const kmp_info_t *th;
8125   kmp_str_buf_t field;
8126 
8127   KMP_DEBUG_ASSERT(buffer);
8128   KMP_DEBUG_ASSERT(gtid >= 0);
8129 
8130   __kmp_str_buf_init(&field);
8131   __kmp_str_buf_clear(buffer);
8132 
8133   th = __kmp_threads[gtid];
8134   retval = 0;
8135 
8136   // If format is NULL or zero-length string, then we use
8137   // affinity-format-var ICV
8138   parse_ptr = format;
8139   if (parse_ptr == NULL || *parse_ptr == '\0') {
8140     parse_ptr = __kmp_affinity_format;
8141   }
8142   KMP_DEBUG_ASSERT(parse_ptr);
8143 
8144   while (*parse_ptr != '\0') {
8145     // Parse a field
8146     if (*parse_ptr == '%') {
8147       // Put field in the buffer
8148       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8149       __kmp_str_buf_catbuf(buffer, &field);
8150       retval += rc;
8151     } else {
8152       // Put literal character in buffer
8153       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8154       retval++;
8155       parse_ptr++;
8156     }
8157   }
8158   __kmp_str_buf_free(&field);
8159   return retval;
8160 }
8161 
8162 // Displays the affinity string to stdout
8163 void __kmp_aux_display_affinity(int gtid, const char *format) {
8164   kmp_str_buf_t buf;
8165   __kmp_str_buf_init(&buf);
8166   __kmp_aux_capture_affinity(gtid, format, &buf);
8167   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8168   __kmp_str_buf_free(&buf);
8169 }
8170 
8171 /* ------------------------------------------------------------------------ */
8172 
8173 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8174   int blocktime = arg; /* argument is in milliseconds */
8175 #if KMP_USE_MONITOR
8176   int bt_intervals;
8177 #endif
8178   int bt_set;
8179 
8180   __kmp_save_internal_controls(thread);
8181 
8182   /* Normalize and set blocktime for the teams */
8183   if (blocktime < KMP_MIN_BLOCKTIME)
8184     blocktime = KMP_MIN_BLOCKTIME;
8185   else if (blocktime > KMP_MAX_BLOCKTIME)
8186     blocktime = KMP_MAX_BLOCKTIME;
8187 
8188   set__blocktime_team(thread->th.th_team, tid, blocktime);
8189   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8190 
8191 #if KMP_USE_MONITOR
8192   /* Calculate and set blocktime intervals for the teams */
8193   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8194 
8195   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8196   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8197 #endif
8198 
8199   /* Set whether blocktime has been set to "TRUE" */
8200   bt_set = TRUE;
8201 
8202   set__bt_set_team(thread->th.th_team, tid, bt_set);
8203   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8204 #if KMP_USE_MONITOR
8205   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8206                 "bt_intervals=%d, monitor_updates=%d\n",
8207                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8208                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8209                 __kmp_monitor_wakeups));
8210 #else
8211   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8212                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8213                 thread->th.th_team->t.t_id, tid, blocktime));
8214 #endif
8215 }
8216 
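// Apply a string of runtime settings (a "NAME=value" list, as passed to
// kmp_set_defaults()), initializing the serial runtime first if necessary.
// Note: 'len' is not used here.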
void __kmp_aux_set_defaults(char const *str, int len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: the critical construct (lck != NULL, as in the
  // current PAROPT compiler).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic-reduce method
  // can be selected by the RTL.
  // In the end it is up to the OpenMP RTL to decide which of the
  // compiler-generated (PAROPT) methods to select.
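  // Possible results: empty_reduce_block (serialized team),
  // critical_reduce_block (the default), atomic_reduce_block, or one of the
  // TREE_REDUCE_BLOCK_* variants chosen below.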

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // An alternative way of getting the team size (with one extra dynamic
  // dereference) is slower.
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;
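    // The tree method is only chosen above this team size; MIC parts use a
    // larger cutoff (see below).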

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION
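  // (__kmp_force_reduction_method is normally set via the KMP_FORCE_REDUCTION
  // setting; see kmp_settings.cpp.)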

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
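// (The reduction method is kept in the upper bits of the packed value, with
// the barrier kind in the low bits, hence the >> 8 below; see the
// PACKED_REDUCTION_METHOD_T packing in kmp.h.)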
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely.  Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume resets __kmp_pause_status to kmp_not_paused and wakes up all
// threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
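// 'level' selects resume (kmp_not_paused), soft pause, or hard pause; a
// nonzero return value means the request could not be honored (e.g. the
// runtime was already paused, or was not paused when a resume was requested).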
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

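// Print the runtime's environment/ICV settings under the initialization lock,
// running serial initialization first if needed; 'verbose' selects the
// verbose form of the listing (cf. OMP_DISPLAY_ENV=verbose).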
void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}