1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
__kmpc_begin(ident_t * loc,kmp_int32 flags)36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shutdown the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    // -1 means "the calling thread"; unregisters this root.
    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
__kmpc_global_thread_num(ident_t * loc)98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  // TCR_4: thread-consistent 4-byte read of the global thread count.
  return TCR_4(__kmp_all_nth);
}
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
__kmpc_bound_thread_num(ident_t * loc)133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
__kmpc_bound_num_threads(ident_t * loc)143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
__kmpc_ok_to_fork(ident_t * loc)155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
__kmpc_in_parallel(ident_t * loc)217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  // Record the request; it is consumed by the next fork from this thread.
  __kmp_push_num_threads(loc, global_tid, num_threads);
}
237 
// Companion to __kmpc_push_num_threads. Intentionally does no work: the
// pushed num_threads value is consumed (popped) by the fork itself.
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
  /* the num_threads are automatically popped */
}
242 
// Set the proc-bind policy to be used by the next fork spawned by this
// thread (supports the parallel construct's proc_bind clause).
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));
  __kmp_assert_valid_gtid(global_tid);
  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Publish the caller's frame so an OMPT tool can walk the stack across
    // the runtime entry; a serialized (lightweight) task team, if present,
    // carries its own task frame.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    // Fork: run the outlined microtask on the (possibly newly created) team.
    // The varargs (shared-variable pointers) travel via kmp_va_addr_of(ap).
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
                    kmp_va_addr_of(ap));
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    // Join: implicit barrier at the end of the region, then back to serial.
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
    KMP_SET_THREAD_STATE(previous_state);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));
  __kmp_assert_valid_gtid(global_tid);
  // Record the request; it is consumed by the next __kmpc_fork_teams.
  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc  source location information
357 @param argc  total number of arguments in the ellipsis
358 @param microtask  pointer to callback routine consisting of outlined teams
359 construct
360 @param ...  pointers to shared variables that aren't global
361 
362 Do the actual fork and call the microtask in the relevant number of threads.
363 */
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Publish the caller's frame for the tool before forking.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Fork with __kmp_teams_master as the outer task: it creates the league
  // of teams and runs the user microtask as each team's master.
  __kmp_fork_call(
      loc, gtid, fork_context_intel, argc,
      VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  // Clear teams state; th_teams_size is cleared with one 64-bit store.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
    KMP_SET_THREAD_STATE(previous_state);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
446 
447 // I don't think this function should ever have been exported.
448 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
449 // openmp code ever called it, but it's been exported from the RTL for so
450 // long that I'm afraid to remove the definition.
__kmpc_invoke_task_func(int gtid)451 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
452 
453 /*!
454 @ingroup PARALLEL
455 @param loc  source location information
456 @param global_tid  global thread number
457 
458 Enter a serialized parallel construct. This interface is used to handle a
459 conditional parallel region, like this,
460 @code
461 #pragma omp parallel if (condition)
462 @endcode
463 when the condition is false.
464 */
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
  __kmp_assert_valid_gtid(global_tid);
#if OMPT_SUPPORT
  // Capture the caller's return address for OMPT event reporting.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
475 
476 /*!
477 @ingroup PARALLEL
478 @param loc  source location information
479 @param global_tid  global thread number
480 
481 Leave a serialized parallel construct.
482 */
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  __kmp_assert_valid_gtid(global_tid);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  kmp_task_team_t *task_team = this_thr->th.th_task_team;
  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));

  KMP_MB();
  // Sanity: the thread must currently be executing inside its serial team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  // Report the end of the serialized region to the tool: implicit-task end,
  // then parallel end, then unlink the lightweight task team.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program | ompt_parallel_team,
          OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Restore FP control state saved when the region was serialized.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    // Re-attach the thread to the parent team.
    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    // Still nested inside at least one more serialized level; nothing else
    // needs restoring yet.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
624 
625 /*!
626 @ingroup SYNCHRONIZATION
627 @param loc  source location information.
628 
629 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
630 depending on the memory ordering convention obeyed by the compiler
631 even that may not be necessary).
632 */
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed either (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
       KMP_ARCH_RISCV64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Notify the tool that a flush happened at the caller's address.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
684 
685 /* -------------------------------------------------------------------------- */
686 /*!
687 @ingroup SYNCHRONIZATION
688 @param loc source location information
689 @param global_tid thread id.
690 
691 Execute a barrier.
692 */
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  // Publish the caller's frame (if not already set) so a tool can attribute
  // the barrier wait; cleared again after the barrier below.
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
734 
735 /* The BARRIER for a MASTER section is always explicit   */
736 /*!
737 @ingroup WORK_SHARING
738 @param loc  source location information.
@param global_tid  global thread number.
740 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
741 */
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  // 1 if the calling thread is the team master and should run the block.
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Only the thread that will execute the block reports scope_begin.
  if (status) {
    if (ompt_enabled.ompt_callback_masked) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_masked)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
    // The executing thread pushes the construct; others only validate it.
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}
790 
791 /*!
792 @ingroup WORK_SHARING
793 @param loc  source location information.
@param global_tid  global thread number.
795 
796 Mark the end of a <tt>master</tt> region. This should only be called by the
797 thread that executes the <tt>master</tt> region.
798 */
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  // Must be called only by the thread that executed the master block.
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_masked) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_masked)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    // Pop the ct_master entry pushed by __kmpc_master.
    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}
822 
823 /*!
824 @ingroup WORK_SHARING
825 @param loc  source location information.
826 @param gtid  global thread number.
827 
828 Start execution of an <tt>ordered</tt> construct.
829 */
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Report the ordered construct to the tool as a mutex acquire/acquired
  // pair bracketing the actual wait below.
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.enabled) {
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  // Wait for our turn; a dispatcher-specific hook may override the default
  // "deo" (dispatch-enter-ordered) protocol.
  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}
895 
896 /*!
897 @ingroup WORK_SHARING
898 @param loc  source location information.
899 @param gtid  global thread number.
900 
901 End execution of an <tt>ordered</tt> construct.
902 */
__kmpc_end_ordered(ident_t * loc,kmp_int32 gtid)903 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
904   int cid = 0;
905   kmp_info_t *th;
906 
907   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
908   __kmp_assert_valid_gtid(gtid);
909 
910 #if USE_ITT_BUILD
911   __kmp_itt_ordered_end(gtid);
912 // TODO: ordered_wait_id
913 #endif /* USE_ITT_BUILD */
914 
915   th = __kmp_threads[gtid];
916 
917   if (th->th.th_dispatch->th_dxo_fcn != 0)
918     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
919   else
920     __kmp_parallel_dxo(&gtid, &cid, loc);
921 
922 #if OMPT_SUPPORT && OMPT_OPTIONAL
923   OMPT_STORE_RETURN_ADDRESS(gtid);
924   if (ompt_enabled.ompt_callback_mutex_released) {
925     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
926         ompt_mutex_ordered,
927         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
928             ->t.t_ordered.dt.t_value,
929         OMPT_LOAD_RETURN_ADDRESS(gtid));
930   }
931 #endif
932 }
933 
934 #if KMP_USE_DYNAMIC_LOCK
935 
// Initialize the indirect-lock slot named by 'crit': allocate an indirect
// lock of kind 'tag' and publish its pointer into *crit with a CAS so that
// racing initializers agree on a single winner.
static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  // Publish; status == 0 means another thread won the race and our lock is
  // simply left unclaimed.
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}
964 
// Fast-path acquire tas lock (test-and-set).
// Spins until the CAS from "free" to "busy (gtid+1)" succeeds, applying
// spin backoff and yielding when the machine is oversubscribed (__kmp_nth
// above the available processor count).
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
991 
// Fast-path test tas lock.
// Non-blocking: rc becomes TRUE only when the lock was observed free and the
// acquiring CAS succeeded.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
1001 
// Fast-path release tas lock: a single releasing store of the "free" value.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1005 
1006 #if KMP_USE_FUTEX
1007 
1008 #include <sys/syscall.h>
1009 #include <unistd.h>
1010 #ifndef FUTEX_WAIT
1011 #define FUTEX_WAIT 0
1012 #endif
1013 #ifndef FUTEX_WAKE
1014 #define FUTEX_WAKE 1
1015 #endif
1016 
// Fast-path acquire futex lock.
// The owner is encoded in the poll word as (gtid+1) << 1; the low bit marks
// "waiters present". On contention the waiter sets the low bit and sleeps in
// FUTEX_WAIT until the releaser wakes it.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1046 
// Fast-path test futex lock.
// Non-blocking: rc becomes TRUE only when the lock was free and the
// acquiring CAS succeeded. The owner encoding matches
// KMP_ACQUIRE_FUTEX_LOCK: (gtid + 1) << 1, low bit reserved as the
// "waiters present" flag. The shift is parenthesized explicitly -- the
// previous "gtid + 1 << 1" relied on '+' binding tighter than '<<'.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1059 
// Fast-path release futex lock.
// Swaps the "free" value into the poll word; if the waiters bit was set,
// wakes one sleeper via FUTEX_WAKE.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }
1075 
1076 #endif // KMP_USE_FUTEX
1077 
1078 #else // KMP_USE_DYNAMIC_LOCK
1079 
// Return the user lock backing the critical section named by 'crit',
// allocating it on first use. Publication uses a compare-and-swap; a thread
// that loses the race frees its freshly allocated lock and adopts the
// winner's.
static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will destroyed with no usage, but it is not a
// problem, because this is not real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer.  If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}
1125 
1126 #endif // KMP_USE_DYNAMIC_LOCK
1127 
1128 /*!
1129 @ingroup WORK_SHARING
1130 @param loc  source location information.
1131 @param global_tid  global thread number.
1132 @param crit identity of the critical section. This could be a pointer to a lock
1133 associated with the critical section, or some other suitably unique value.
1134 
1135 Enter code protected by a `critical` construct.
1136 This function blocks until the executing thread can enter the critical section.
1137 */
__kmpc_critical(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)1138 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1139                      kmp_critical_name *crit) {
1140 #if KMP_USE_DYNAMIC_LOCK
1141 #if OMPT_SUPPORT && OMPT_OPTIONAL
1142   OMPT_STORE_RETURN_ADDRESS(global_tid);
1143 #endif // OMPT_SUPPORT
1144   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1145 #else
1146   KMP_COUNT_BLOCK(OMP_CRITICAL);
1147 #if OMPT_SUPPORT && OMPT_OPTIONAL
1148   ompt_state_t prev_state = ompt_state_undefined;
1149   ompt_thread_info_t ti;
1150 #endif
1151   kmp_user_lock_p lck;
1152 
1153   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1154   __kmp_assert_valid_gtid(global_tid);
1155 
1156   // TODO: add THR_OVHD_STATE
1157 
1158   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1159   KMP_CHECK_USER_LOCK_INIT();
1160 
1161   if ((__kmp_user_lock_kind == lk_tas) &&
1162       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1163     lck = (kmp_user_lock_p)crit;
1164   }
1165 #if KMP_USE_FUTEX
1166   else if ((__kmp_user_lock_kind == lk_futex) &&
1167            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1168     lck = (kmp_user_lock_p)crit;
1169   }
1170 #endif
1171   else { // ticket, queuing or drdpa
1172     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1173   }
1174 
1175   if (__kmp_env_consistency_check)
1176     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1177 
1178 // since the critical directive binds to all threads, not just the current
1179 // team we have to check this even if we are in a serialized team.
1180 // also, even if we are the uber thread, we still have to conduct the lock,
1181 // as we have to contend with sibling threads.
1182 
1183 #if USE_ITT_BUILD
1184   __kmp_itt_critical_acquiring(lck);
1185 #endif /* USE_ITT_BUILD */
1186 #if OMPT_SUPPORT && OMPT_OPTIONAL
1187   OMPT_STORE_RETURN_ADDRESS(gtid);
1188   void *codeptr_ra = NULL;
1189   if (ompt_enabled.enabled) {
1190     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1191     /* OMPT state update */
1192     prev_state = ti.state;
1193     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1194     ti.state = ompt_state_wait_critical;
1195 
1196     /* OMPT event callback */
1197     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1198     if (ompt_enabled.ompt_callback_mutex_acquire) {
1199       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1200           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1201           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1202     }
1203   }
1204 #endif
1205   // Value of 'crit' should be good for using as a critical_id of the critical
1206   // section directive.
1207   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1208 
1209 #if USE_ITT_BUILD
1210   __kmp_itt_critical_acquired(lck);
1211 #endif /* USE_ITT_BUILD */
1212 #if OMPT_SUPPORT && OMPT_OPTIONAL
1213   if (ompt_enabled.enabled) {
1214     /* OMPT state update */
1215     ti.state = prev_state;
1216     ti.wait_id = 0;
1217 
1218     /* OMPT event callback */
1219     if (ompt_enabled.ompt_callback_mutex_acquired) {
1220       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1221           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1222     }
1223   }
1224 #endif
1225   KMP_POP_PARTITIONED_TIMER();
1226 
1227   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1228   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1229 #endif // KMP_USE_DYNAMIC_LOCK
1230 }
1231 
1232 #if KMP_USE_DYNAMIC_LOCK
1233 
// Converts the given hint to an internal lock implementation (lock
// sequence). The checks are ordered by precedence: KMP-specific TSX hints
// first, then conflicting standard hints collapse to the default lock,
// then contention/speculation hints.
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
// KMP_TSX_LOCK(seq) names the TSX lock sequence when TSX is compiled in,
// and falls back to the default user lock otherwise.
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

// Runtime RTM availability; always 0 off x86.
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}
1277 
1278 #if OMPT_SUPPORT && OMPT_OPTIONAL
1279 #if KMP_USE_DYNAMIC_LOCK
// Map a lock object to the OMPT mutex-implementation category reported to
// tools. Direct locks are classified by the tag embedded in 'user_lock';
// otherwise the indirect lock ('ilock', looked up from 'user_lock' when not
// supplied by the caller) is classified by its type.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      // Tag 0: not a direct lock; fall through to the indirect lookup below.
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
1324 #else
// For locks without dynamic binding: classify the OMPT mutex implementation
// from the single global user-lock kind selected at runtime startup.
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex: // grouped with the queuing implementations below
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
1347 #endif // KMP_USE_DYNAMIC_LOCK
1348 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1349 
1350 /*!
1351 @ingroup WORK_SHARING
1352 @param loc  source location information.
1353 @param global_tid  global thread number.
1354 @param crit identity of the critical section. This could be a pointer to a lock
1355 associated with the critical section, or some other suitably unique value.
1356 @param hint the lock hint.
1357 
1358 Enter code protected by a `critical` construct with a hint. The hint value is
1359 used to suggest a lock implementation. This function blocks until the executing
1360 thread can enter the critical section unless the hint suggests use of
1361 speculative execution and the hardware supports it.
1362 */
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    // First use of this critical name: pick a lock kind from the hint and
    // publish it (direct locks store a tag in place; indirect locks store a
    // pointer via __kmp_init_indirect_csptr).
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock lives in place in 'crit'.
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      // NOTE(review): 'ti' is a struct copy; the state/wait_id updates below
      // modify the copy, not the live thread record -- confirm intended.
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    // Indirect lock: dereference the pointer published in 'crit'.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint
1483 
1484 #endif // KMP_USE_DYNAMIC_LOCK
1485 
1486 /*!
1487 @ingroup WORK_SHARING
1488 @param loc  source location information.
@param global_tid  global thread number.
1490 @param crit identity of the critical section. This could be a pointer to a lock
1491 associated with the critical section, or some other suitably unique value.
1492 
1493 Leave a critical section, releasing any lock that was held during its execution.
1494 */
__kmpc_end_critical(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)1495 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1496                          kmp_critical_name *crit) {
1497   kmp_user_lock_p lck;
1498 
1499   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1500 
1501 #if KMP_USE_DYNAMIC_LOCK
1502   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1503     lck = (kmp_user_lock_p)crit;
1504     KMP_ASSERT(lck != NULL);
1505     if (__kmp_env_consistency_check) {
1506       __kmp_pop_sync(global_tid, ct_critical, loc);
1507     }
1508 #if USE_ITT_BUILD
1509     __kmp_itt_critical_releasing(lck);
1510 #endif
1511 #if KMP_USE_INLINED_TAS
1512     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1513       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1514     } else
1515 #elif KMP_USE_INLINED_FUTEX
1516     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1517       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1518     } else
1519 #endif
1520     {
1521       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1522     }
1523   } else {
1524     kmp_indirect_lock_t *ilk =
1525         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1526     KMP_ASSERT(ilk != NULL);
1527     lck = ilk->lock;
1528     if (__kmp_env_consistency_check) {
1529       __kmp_pop_sync(global_tid, ct_critical, loc);
1530     }
1531 #if USE_ITT_BUILD
1532     __kmp_itt_critical_releasing(lck);
1533 #endif
1534     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1535   }
1536 
1537 #else // KMP_USE_DYNAMIC_LOCK
1538 
1539   if ((__kmp_user_lock_kind == lk_tas) &&
1540       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1541     lck = (kmp_user_lock_p)crit;
1542   }
1543 #if KMP_USE_FUTEX
1544   else if ((__kmp_user_lock_kind == lk_futex) &&
1545            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1546     lck = (kmp_user_lock_p)crit;
1547   }
1548 #endif
1549   else { // ticket, queuing or drdpa
1550     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1551   }
1552 
1553   KMP_ASSERT(lck != NULL);
1554 
1555   if (__kmp_env_consistency_check)
1556     __kmp_pop_sync(global_tid, ct_critical, loc);
1557 
1558 #if USE_ITT_BUILD
1559   __kmp_itt_critical_releasing(lck);
1560 #endif /* USE_ITT_BUILD */
1561   // Value of 'crit' should be good for using as a critical_id of the critical
1562   // section directive.
1563   __kmp_release_user_lock_with_checks(lck, global_tid);
1564 
1565 #endif // KMP_USE_DYNAMIC_LOCK
1566 
1567 #if OMPT_SUPPORT && OMPT_OPTIONAL
1568   /* OMPT release event triggers after lock is released; place here to trigger
1569    * for all #if branches */
1570   OMPT_STORE_RETURN_ADDRESS(global_tid);
1571   if (ompt_enabled.ompt_callback_mutex_released) {
1572     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1573         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1574         OMPT_LOAD_RETURN_ADDRESS(0));
1575   }
1576 #endif
1577 
1578   KMP_POP_PARTITIONED_TIMER();
1579   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1580 }
1581 
1582 /*!
1583 @ingroup SYNCHRONIZATION
1584 @param loc source location information
1585 @param global_tid thread id.
1586 @return one if the thread should execute the master block, zero otherwise
1587 
1588 Start execution of a combined barrier and master. The barrier is executed inside
1589 this function.
1590 */
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  // Combined barrier + master entry: every thread waits at a plain split
  // barrier here; the return value tells the caller whether to execute the
  // master block (see the 0/1 contract documented above).
  int status;
  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // Lazily complete runtime initialization on first parallel use.
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Record the runtime-entry frame so tools can unwind past the barrier.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  // The thread for which __kmp_barrier returned 0 executes the master block.
  return (status != 0) ? 0 : 1;
}
1625 
1626 /*!
1627 @ingroup SYNCHRONIZATION
1628 @param loc source location information
1629 @param global_tid thread id.
1630 
1631 Complete the execution of a combined barrier and master. This function should
1632 only be called at the completion of the <tt>master</tt> code. Other threads will
1633 still be waiting at the barrier and this call releases them.
1634 */
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);
  // Release the threads still parked in the split barrier that was entered
  // by __kmpc_barrier_master; only the master-block executor calls this.
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1640 
1641 /*!
1642 @ingroup SYNCHRONIZATION
1643 @param loc source location information
1644 @param global_tid thread id.
1645 @return one if the thread should execute the master block, zero otherwise
1646 
1647 Start execution of a combined barrier and master(nowait) construct.
1648 The barrier is executed inside this function.
1649 There is no equivalent "end" function, since the
1650 */
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  // 1 if this thread should run the master block, 0 otherwise.
  kmp_int32 ret;
  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  // Publish the frame to the tool before blocking in the barrier.
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // Plain (non-split) barrier first; master selection happens separately
  // below via __kmpc_master, hence "nowait" semantics afterwards.
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /*  there's no __kmpc_end_master called; so the (stats) */
    /*  actions of __kmpc_end_master are done here          */
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master())       */
      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}
1701 
1702 /* The BARRIER for a SINGLE process section is always explicit   */
1703 /*!
1704 @ingroup WORK_SHARING
1705 @param loc  source location information
1706 @param global_tid  global thread number
1707 @return One if this thread should execute the single construct, zero otherwise.
1708 
1709 Test whether to execute a <tt>single</tt> construct.
1710 There are no implicit barriers in the two "single" calls, rather the compiler
1711 should introduce an explicit barrier if it is required.
1712 */
1713 
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_assert_valid_gtid(global_tid);
  // Race to claim the single region; TRUE requests push of the construct
  // onto the consistency-check stack for the winner.
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor: scope_begin here; the matching scope_end is emitted by
      // __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executor: the "other" region is empty for this thread, so both
      // scope_begin and scope_end are reported back-to-back right here.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
1757 
1758 /*!
1759 @ingroup WORK_SHARING
1760 @param loc  source location information
1761 @param global_tid  global thread number
1762 
1763 Mark the end of a <tt>single</tt> construct.  This function should
1764 only be called by the thread that executed the block of code protected
1765 by the `single` construct.
1766 */
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_assert_valid_gtid(global_tid);
  __kmp_exit_single(global_tid);
  // Pops the OMP_single timer pushed by __kmpc_single for the executor.
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  // scope_end pairing the scope_begin emitted in __kmpc_single for the
  // thread that executed the block.
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
1786 
1787 /*!
1788 @ingroup WORK_SHARING
1789 @param loc Source location
1790 @param global_tid Global thread id
1791 
1792 Mark the end of a statically scheduled loop.
1793 */
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the ident flags; this entry point is
    // shared by loop, sections and distribute codegen.
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}
1825 
1826 // User routines which take C-style arguments (call by value)
1827 // different from the Fortran equivalent routines
1828 
ompc_set_num_threads(int arg)1829 void ompc_set_num_threads(int arg) {
1830   // !!!!! TODO: check the per-task binding
1831   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1832 }
1833 
ompc_set_dynamic(int flag)1834 void ompc_set_dynamic(int flag) {
1835   kmp_info_t *thread;
1836 
1837   /* For the thread-private implementation of the internal controls */
1838   thread = __kmp_entry_thread();
1839 
1840   __kmp_save_internal_controls(thread);
1841 
1842   set__dynamic(thread, flag ? TRUE : FALSE);
1843 }
1844 
ompc_set_nested(int flag)1845 void ompc_set_nested(int flag) {
1846   kmp_info_t *thread;
1847 
1848   /* For the thread-private internal controls implementation */
1849   thread = __kmp_entry_thread();
1850 
1851   __kmp_save_internal_controls(thread);
1852 
1853   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1854 }
1855 
ompc_set_max_active_levels(int max_active_levels)1856 void ompc_set_max_active_levels(int max_active_levels) {
1857   /* TO DO */
1858   /* we want per-task implementation of this internal control */
1859 
1860   /* For the per-thread internal controls implementation */
1861   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1862 }
1863 
ompc_set_schedule(omp_sched_t kind,int modifier)1864 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1865   // !!!!! TODO: check the per-task binding
1866   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1867 }
1868 
ompc_get_ancestor_thread_num(int level)1869 int ompc_get_ancestor_thread_num(int level) {
1870   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1871 }
1872 
ompc_get_team_size(int level)1873 int ompc_get_team_size(int level) {
1874   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1875 }
1876 
1877 /* OpenMP 5.0 Affinity Format API */
1878 
ompc_set_affinity_format(char const * format)1879 void ompc_set_affinity_format(char const *format) {
1880   if (!__kmp_init_serial) {
1881     __kmp_serial_initialize();
1882   }
1883   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1884                          format, KMP_STRLEN(format) + 1);
1885 }
1886 
ompc_get_affinity_format(char * buffer,size_t size)1887 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1888   size_t format_size;
1889   if (!__kmp_init_serial) {
1890     __kmp_serial_initialize();
1891   }
1892   format_size = KMP_STRLEN(__kmp_affinity_format);
1893   if (buffer && size) {
1894     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1895                            format_size + 1);
1896   }
1897   return format_size;
1898 }
1899 
ompc_display_affinity(char const * format)1900 void ompc_display_affinity(char const *format) {
1901   int gtid;
1902   if (!TCR_4(__kmp_init_middle)) {
1903     __kmp_middle_initialize();
1904   }
1905   gtid = __kmp_get_gtid();
1906   __kmp_aux_display_affinity(gtid, format);
1907 }
1908 
ompc_capture_affinity(char * buffer,size_t buf_size,char const * format)1909 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1910                              char const *format) {
1911   int gtid;
1912   size_t num_required;
1913   kmp_str_buf_t capture_buf;
1914   if (!TCR_4(__kmp_init_middle)) {
1915     __kmp_middle_initialize();
1916   }
1917   gtid = __kmp_get_gtid();
1918   __kmp_str_buf_init(&capture_buf);
1919   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1920   if (buffer && buf_size) {
1921     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1922                            capture_buf.used + 1);
1923   }
1924   __kmp_str_buf_free(&capture_buf);
1925   return num_required;
1926 }
1927 
// Set the worker-thread stack size (int variant of the kmpc_ entry point).
void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1932 
// Set the worker-thread stack size (size_t variant, for large sizes).
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1937 
kmpc_set_blocktime(int arg)1938 void kmpc_set_blocktime(int arg) {
1939   int gtid, tid;
1940   kmp_info_t *thread;
1941 
1942   gtid = __kmp_entry_gtid();
1943   tid = __kmp_tid_from_gtid(gtid);
1944   thread = __kmp_thread_from_gtid(gtid);
1945 
1946   __kmp_aux_set_blocktime(arg, thread, tid);
1947 }
1948 
// Select the runtime library mode (serial/turnaround/throughput).
void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}
1953 
// Apply a KMP_SETTINGS-style string of runtime defaults.
// NOTE(review): str must be non-NULL — KMP_STRLEN is applied unconditionally.
void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}
1958 
kmpc_set_disp_num_buffers(int arg)1959 void kmpc_set_disp_num_buffers(int arg) {
1960   // ignore after initialization because some teams have already
1961   // allocated dispatch buffers
1962   if (__kmp_init_serial == 0 && arg > 0)
1963     __kmp_dispatch_num_buffers = arg;
1964 }
1965 
// Add processor `proc` to the affinity mask pointed to by *mask.
// Returns -1 when affinity is unsupported (or in the stub library);
// otherwise forwards the aux routine's status.
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}
1976 
// Remove processor `proc` from the affinity mask pointed to by *mask.
// Returns -1 when affinity is unsupported (or in the stub library).
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}
1987 
// Query whether processor `proc` is present in the affinity mask *mask.
// Returns -1 when affinity is unsupported (or in the stub library).
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}
1998 
1999 /* -------------------------------------------------------------------------- */
2000 /*!
2001 @ingroup THREADPRIVATE
2002 @param loc       source location information
2003 @param gtid      global thread number
2004 @param cpy_size  size of the cpy_data buffer
2005 @param cpy_data  pointer to data to be copied
2006 @param cpy_func  helper function to call for copying data
2007 @param didit     flag variable: 1=single thread; 0=not single thread
2008 
2009 __kmpc_copyprivate implements the interface for the private data broadcast
2010 needed for the copyprivate clause associated with a single region in an
2011 OpenMP<sup>*</sup> program (both C and Fortran).
2012 All threads participating in the parallel region call this routine.
2013 One of the threads (called the single thread) should have the <tt>didit</tt>
2014 variable set to 1 and all other threads should have that variable set to 0.
2015 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2016 
2017 The OpenMP specification forbids the use of nowait on the single region when a
2018 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2019 barrier internally to avoid race conditions, so the code generation for the
2020 single region should avoid generating a barrier after the call to @ref
2021 __kmpc_copyprivate.
2022 
2023 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2024 The <tt>loc</tt> parameter is a pointer to source location information.
2025 
2026 Internal implementation: The single thread will first copy its descriptor
2027 address (cpy_data) to a team-private location, then the other threads will each
2028 call the function pointed to by the parameter cpy_func, which carries out the
2029 copy by copying the data using the cpy_data buffer.
2030 
2031 The cpy_func routine used for the copy and the contents of the data area defined
2032 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2033 to be done. For instance, the cpy_data buffer can hold the actual data to be
2034 copied or it may hold a list of pointers to the data. The cpy_func routine must
2035 interpret the cpy_data buffer appropriately.
2036 
2037 The interface to cpy_func is as follows:
2038 @code
2039 void cpy_func( void *destination, void *source )
2040 @endcode
2041 where void *destination is the cpy_data pointer for the thread being copied to
2042 and void *source is the cpy_data pointer for the thread being copied from.
2043 */
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;
  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  KMP_MB();

  // Team-shared slot through which the single thread publishes its cpy_data
  // pointer to the rest of the team.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // Only the thread that executed the single block (didit == 1) publishes.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // Barrier 1: guarantees the published pointer is visible before any
  // consumer dereferences it.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Every non-single thread copies from the single thread's buffer.
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks
  {
#if OMPT_SUPPORT
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Barrier 2: no thread leaves until all copies complete, so the single
  // thread's buffer cannot be reused/freed while still being read.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
  }
}
2102 
2103 /* -------------------------------------------------------------------------- */
2104 
/* Lock entry-point mapping: the generic lock operations used by the
   non-dynamic-lock code paths below all route through the "with_checks"
   user-lock implementations (runtime consistency checking is decided
   inside those routines). */
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2118 
2119 // TODO: Make check abort messages use location info & pass it into
2120 // with_checks routines
2121 
2122 #if KMP_USE_DYNAMIC_LOCK
2123 
2124 // internal lock initializer
__kmp_init_lock_with_hint(ident_t * loc,void ** lock,kmp_dyna_lockseq_t seq)2125 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2126                                                     kmp_dyna_lockseq_t seq) {
2127   if (KMP_IS_D_LOCK(seq)) {
2128     KMP_INIT_D_LOCK(lock, seq);
2129 #if USE_ITT_BUILD
2130     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2131 #endif
2132   } else {
2133     KMP_INIT_I_LOCK(lock, seq);
2134 #if USE_ITT_BUILD
2135     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2136     __kmp_itt_lock_creating(ilk->lock, loc);
2137 #endif
2138   }
2139 }
2140 
2141 // internal nest lock initializer
// internal nest lock initializer: maps a flat lock sequence to its nested
// counterpart (nested locks are always indirect) and announces it to ITT.
static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
                               kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
  // Don't have nested lock implementation for speculative locks
  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
    seq = __kmp_user_lock_seq;
#endif
  switch (seq) {
  case lockseq_tas:
    seq = lockseq_nested_tas;
    break;
#if KMP_USE_FUTEX
  case lockseq_futex:
    seq = lockseq_nested_futex;
    break;
#endif
  case lockseq_ticket:
    seq = lockseq_nested_ticket;
    break;
  case lockseq_queuing:
    seq = lockseq_nested_queuing;
    break;
  case lockseq_drdpa:
    seq = lockseq_nested_drdpa;
    break;
  default:
    // Anything without a dedicated nested variant falls back to queuing.
    seq = lockseq_nested_queuing;
  }
  KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
  __kmp_itt_lock_creating(ilk->lock, loc);
#endif
}
2177 
2178 /* initialize the lock with a hint */
__kmpc_init_lock_with_hint(ident_t * loc,kmp_int32 gtid,void ** user_lock,uintptr_t hint)2179 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2180                                 uintptr_t hint) {
2181   KMP_DEBUG_ASSERT(__kmp_init_serial);
2182   if (__kmp_env_consistency_check && user_lock == NULL) {
2183     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2184   }
2185 
2186   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2187 
2188 #if OMPT_SUPPORT && OMPT_OPTIONAL
2189   // This is the case, if called from omp_init_lock_with_hint:
2190   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2191   if (!codeptr)
2192     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2193   if (ompt_enabled.ompt_callback_lock_init) {
2194     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2195         ompt_mutex_lock, (omp_lock_hint_t)hint,
2196         __ompt_get_mutex_impl_type(user_lock),
2197         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2198   }
2199 #endif
2200 }
2201 
2202 /* initialize the lock with a hint */
__kmpc_init_nest_lock_with_hint(ident_t * loc,kmp_int32 gtid,void ** user_lock,uintptr_t hint)2203 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2204                                      void **user_lock, uintptr_t hint) {
2205   KMP_DEBUG_ASSERT(__kmp_init_serial);
2206   if (__kmp_env_consistency_check && user_lock == NULL) {
2207     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2208   }
2209 
2210   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2211 
2212 #if OMPT_SUPPORT && OMPT_OPTIONAL
2213   // This is the case, if called from omp_init_lock_with_hint:
2214   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2215   if (!codeptr)
2216     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2217   if (ompt_enabled.ompt_callback_lock_init) {
2218     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2219         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2220         __ompt_get_mutex_impl_type(user_lock),
2221         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2222   }
2223 #endif
2224 }
2225 
2226 #endif // KMP_USE_DYNAMIC_LOCK
2227 
2228 /* initialize the lock */
__kmpc_init_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2229 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2230 #if KMP_USE_DYNAMIC_LOCK
2231 
2232   KMP_DEBUG_ASSERT(__kmp_init_serial);
2233   if (__kmp_env_consistency_check && user_lock == NULL) {
2234     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2235   }
2236   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2237 
2238 #if OMPT_SUPPORT && OMPT_OPTIONAL
2239   // This is the case, if called from omp_init_lock_with_hint:
2240   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2241   if (!codeptr)
2242     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2243   if (ompt_enabled.ompt_callback_lock_init) {
2244     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2245         ompt_mutex_lock, omp_lock_hint_none,
2246         __ompt_get_mutex_impl_type(user_lock),
2247         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2248   }
2249 #endif
2250 
2251 #else // KMP_USE_DYNAMIC_LOCK
2252 
2253   static char const *const func = "omp_init_lock";
2254   kmp_user_lock_p lck;
2255   KMP_DEBUG_ASSERT(__kmp_init_serial);
2256 
2257   if (__kmp_env_consistency_check) {
2258     if (user_lock == NULL) {
2259       KMP_FATAL(LockIsUninitialized, func);
2260     }
2261   }
2262 
2263   KMP_CHECK_USER_LOCK_INIT();
2264 
2265   if ((__kmp_user_lock_kind == lk_tas) &&
2266       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2267     lck = (kmp_user_lock_p)user_lock;
2268   }
2269 #if KMP_USE_FUTEX
2270   else if ((__kmp_user_lock_kind == lk_futex) &&
2271            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2272     lck = (kmp_user_lock_p)user_lock;
2273   }
2274 #endif
2275   else {
2276     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2277   }
2278   INIT_LOCK(lck);
2279   __kmp_set_user_lock_location(lck, loc);
2280 
2281 #if OMPT_SUPPORT && OMPT_OPTIONAL
2282   // This is the case, if called from omp_init_lock_with_hint:
2283   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2284   if (!codeptr)
2285     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2286   if (ompt_enabled.ompt_callback_lock_init) {
2287     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2288         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2289         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2290   }
2291 #endif
2292 
2293 #if USE_ITT_BUILD
2294   __kmp_itt_lock_creating(lck);
2295 #endif /* USE_ITT_BUILD */
2296 
2297 #endif // KMP_USE_DYNAMIC_LOCK
2298 } // __kmpc_init_lock
2299 
2300 /* initialize the lock */
__kmpc_init_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2301 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2302 #if KMP_USE_DYNAMIC_LOCK
2303 
2304   KMP_DEBUG_ASSERT(__kmp_init_serial);
2305   if (__kmp_env_consistency_check && user_lock == NULL) {
2306     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2307   }
2308   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2309 
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311   // This is the case, if called from omp_init_lock_with_hint:
2312   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2313   if (!codeptr)
2314     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2315   if (ompt_enabled.ompt_callback_lock_init) {
2316     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2317         ompt_mutex_nest_lock, omp_lock_hint_none,
2318         __ompt_get_mutex_impl_type(user_lock),
2319         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2320   }
2321 #endif
2322 
2323 #else // KMP_USE_DYNAMIC_LOCK
2324 
2325   static char const *const func = "omp_init_nest_lock";
2326   kmp_user_lock_p lck;
2327   KMP_DEBUG_ASSERT(__kmp_init_serial);
2328 
2329   if (__kmp_env_consistency_check) {
2330     if (user_lock == NULL) {
2331       KMP_FATAL(LockIsUninitialized, func);
2332     }
2333   }
2334 
2335   KMP_CHECK_USER_LOCK_INIT();
2336 
2337   if ((__kmp_user_lock_kind == lk_tas) &&
2338       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2339        OMP_NEST_LOCK_T_SIZE)) {
2340     lck = (kmp_user_lock_p)user_lock;
2341   }
2342 #if KMP_USE_FUTEX
2343   else if ((__kmp_user_lock_kind == lk_futex) &&
2344            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2345             OMP_NEST_LOCK_T_SIZE)) {
2346     lck = (kmp_user_lock_p)user_lock;
2347   }
2348 #endif
2349   else {
2350     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2351   }
2352 
2353   INIT_NESTED_LOCK(lck);
2354   __kmp_set_user_lock_location(lck, loc);
2355 
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357   // This is the case, if called from omp_init_lock_with_hint:
2358   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2359   if (!codeptr)
2360     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2361   if (ompt_enabled.ompt_callback_lock_init) {
2362     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2363         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2364         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2365   }
2366 #endif
2367 
2368 #if USE_ITT_BUILD
2369   __kmp_itt_lock_creating(lck);
2370 #endif /* USE_ITT_BUILD */
2371 
2372 #endif // KMP_USE_DYNAMIC_LOCK
2373 } // __kmpc_init_nest_lock
2374 
__kmpc_destroy_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2375 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2376 #if KMP_USE_DYNAMIC_LOCK
2377 
2378 #if USE_ITT_BUILD
2379   kmp_user_lock_p lck;
2380   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2381     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2382   } else {
2383     lck = (kmp_user_lock_p)user_lock;
2384   }
2385   __kmp_itt_lock_destroyed(lck);
2386 #endif
2387 #if OMPT_SUPPORT && OMPT_OPTIONAL
2388   // This is the case, if called from omp_init_lock_with_hint:
2389   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2390   if (!codeptr)
2391     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2392   if (ompt_enabled.ompt_callback_lock_destroy) {
2393     kmp_user_lock_p lck;
2394     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2395       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2396     } else {
2397       lck = (kmp_user_lock_p)user_lock;
2398     }
2399     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2400         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2401   }
2402 #endif
2403   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2404 #else
2405   kmp_user_lock_p lck;
2406 
2407   if ((__kmp_user_lock_kind == lk_tas) &&
2408       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2409     lck = (kmp_user_lock_p)user_lock;
2410   }
2411 #if KMP_USE_FUTEX
2412   else if ((__kmp_user_lock_kind == lk_futex) &&
2413            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2414     lck = (kmp_user_lock_p)user_lock;
2415   }
2416 #endif
2417   else {
2418     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2419   }
2420 
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422   // This is the case, if called from omp_init_lock_with_hint:
2423   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2424   if (!codeptr)
2425     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2426   if (ompt_enabled.ompt_callback_lock_destroy) {
2427     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2428         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2429   }
2430 #endif
2431 
2432 #if USE_ITT_BUILD
2433   __kmp_itt_lock_destroyed(lck);
2434 #endif /* USE_ITT_BUILD */
2435   DESTROY_LOCK(lck);
2436 
2437   if ((__kmp_user_lock_kind == lk_tas) &&
2438       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2439     ;
2440   }
2441 #if KMP_USE_FUTEX
2442   else if ((__kmp_user_lock_kind == lk_futex) &&
2443            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2444     ;
2445   }
2446 #endif
2447   else {
2448     __kmp_user_lock_free(user_lock, gtid, lck);
2449   }
2450 #endif // KMP_USE_DYNAMIC_LOCK
2451 } // __kmpc_destroy_lock
2452 
/* destroy the lock */
// Destroy a nestable user lock created by __kmpc_init_nest_lock
// (implements omp_destroy_nest_lock).
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // Nestable dynamic locks are always indirect; look up the underlying lock
  // object for the ITT profiler notification.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Resolve the lock object: if poll word plus nesting-depth counter fit
  // inside omp_nest_lock_t, the user object is the lock itself; otherwise it
  // holds a lock-table entry.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Embedded locks need no deallocation; table-allocated locks are returned
  // to the pool.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2527 
// Acquire a simple user lock, blocking until it becomes available
// (implements omp_set_lock). gtid is the global thread id of the caller.
void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // Fast inlined acquire for TAS/futex locks when consistency checking is
  // off; otherwise dispatch through the direct-lock function table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Resolve the lock object: locks that fit inside omp_lock_t are used in
  // place, anything else goes through the lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2619 
// Acquire a nestable user lock, blocking if necessary (omp_set_nest_lock).
// A thread that already owns the lock increments its nesting depth; OMPT is
// told whether this was the first acquisition (mutex_acquired) or a nested
// one (nest_lock with ompt_scope_begin).
void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(user_lock),
          (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
    }
  }
#endif
  int acquire_status =
      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
  (void) acquire_status; // silences unused warning when OMPT is compiled out
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
            codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
      }
    }
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK
  int acquire_status;
  kmp_user_lock_p lck;

  // Resolve the lock object (embedded vs lock table), sized against
  // omp_nest_lock_t since the nesting-depth counter must also fit.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
          codeptr);
    }
  }
#endif

  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
      }
    }
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2730 
// Release a simple user lock (omp_unset_lock). The calling thread must own
// the lock.
void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif
  // Fast inlined release for TAS/futex when consistency checking is off;
  // otherwise dispatch through the direct-lock function table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  /* Can't use serial interval since not block structured */
  /* release the lock */

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
// "fast" path implemented to fix customer performance issue
#if USE_ITT_BUILD
    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif /* USE_ITT_BUILD */
    // Releasing a TAS lock is a plain store of 0 to the poll word, followed
    // by a full memory barrier.
    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
    KMP_MB();

#if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case, if called from omp_init_lock_with_hint:
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    if (ompt_enabled.ompt_callback_mutex_released) {
      // NOTE(review): lck has not been assigned on this fast path, so the
      // wait id passed here is indeterminate; the other paths report the
      // resolved lock / user_lock — confirm and align.
      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
#endif

    return;
#else
    lck = (kmp_user_lock_p)user_lock;
#endif
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_releasing(lck);
#endif /* USE_ITT_BUILD */

  RELEASE_LOCK(lck, gtid);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2825 
2826 /* release the lock */
__kmpc_unset_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2827 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2828 #if KMP_USE_DYNAMIC_LOCK
2829 
2830 #if USE_ITT_BUILD
2831   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2832 #endif
2833   int release_status =
2834       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2835   (void) release_status;
2836 
2837 #if OMPT_SUPPORT && OMPT_OPTIONAL
2838   // This is the case, if called from omp_init_lock_with_hint:
2839   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2840   if (!codeptr)
2841     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2842   if (ompt_enabled.enabled) {
2843     if (release_status == KMP_LOCK_RELEASED) {
2844       if (ompt_enabled.ompt_callback_mutex_released) {
2845         // release_lock_last
2846         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2847             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2848             codeptr);
2849       }
2850     } else if (ompt_enabled.ompt_callback_nest_lock) {
2851       // release_lock_prev
2852       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2853           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2854     }
2855   }
2856 #endif
2857 
2858 #else // KMP_USE_DYNAMIC_LOCK
2859 
2860   kmp_user_lock_p lck;
2861 
2862   /* Can't use serial interval since not block structured */
2863 
2864   if ((__kmp_user_lock_kind == lk_tas) &&
2865       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2866        OMP_NEST_LOCK_T_SIZE)) {
2867 #if KMP_OS_LINUX &&                                                            \
2868     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2869     // "fast" path implemented to fix customer performance issue
2870     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2871 #if USE_ITT_BUILD
2872     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2873 #endif /* USE_ITT_BUILD */
2874 
2875 #if OMPT_SUPPORT && OMPT_OPTIONAL
2876     int release_status = KMP_LOCK_STILL_HELD;
2877 #endif
2878 
2879     if (--(tl->lk.depth_locked) == 0) {
2880       TCW_4(tl->lk.poll, 0);
2881 #if OMPT_SUPPORT && OMPT_OPTIONAL
2882       release_status = KMP_LOCK_RELEASED;
2883 #endif
2884     }
2885     KMP_MB();
2886 
2887 #if OMPT_SUPPORT && OMPT_OPTIONAL
2888     // This is the case, if called from omp_init_lock_with_hint:
2889     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2890     if (!codeptr)
2891       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2892     if (ompt_enabled.enabled) {
2893       if (release_status == KMP_LOCK_RELEASED) {
2894         if (ompt_enabled.ompt_callback_mutex_released) {
2895           // release_lock_last
2896           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2897               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2898         }
2899       } else if (ompt_enabled.ompt_callback_nest_lock) {
2900         // release_lock_previous
2901         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2902             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2903       }
2904     }
2905 #endif
2906 
2907     return;
2908 #else
2909     lck = (kmp_user_lock_p)user_lock;
2910 #endif
2911   }
2912 #if KMP_USE_FUTEX
2913   else if ((__kmp_user_lock_kind == lk_futex) &&
2914            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2915             OMP_NEST_LOCK_T_SIZE)) {
2916     lck = (kmp_user_lock_p)user_lock;
2917   }
2918 #endif
2919   else {
2920     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2921   }
2922 
2923 #if USE_ITT_BUILD
2924   __kmp_itt_lock_releasing(lck);
2925 #endif /* USE_ITT_BUILD */
2926 
2927   int release_status;
2928   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2929 #if OMPT_SUPPORT && OMPT_OPTIONAL
2930   // This is the case, if called from omp_init_lock_with_hint:
2931   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2932   if (!codeptr)
2933     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2934   if (ompt_enabled.enabled) {
2935     if (release_status == KMP_LOCK_RELEASED) {
2936       if (ompt_enabled.ompt_callback_mutex_released) {
2937         // release_lock_last
2938         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2939             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2940       }
2941     } else if (ompt_enabled.ompt_callback_nest_lock) {
2942       // release_lock_previous
2943       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2944           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2945     }
2946   }
2947 #endif
2948 
2949 #endif // KMP_USE_DYNAMIC_LOCK
2950 }
2951 
/* try to acquire the lock */
// Nonblocking acquire of a simple lock (omp_test_lock). Returns FTN_TRUE if
// the lock was obtained, FTN_FALSE otherwise.
int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_test_lock);

#if KMP_USE_DYNAMIC_LOCK
  int rc;
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // Fast inlined test for TAS/futex when consistency checking is off;
  // otherwise dispatch through the direct-lock function table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
  } else
#endif
  {
    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
  if (rc) {
#if USE_ITT_BUILD
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
    }
#endif
    return FTN_TRUE;
  } else {
#if USE_ITT_BUILD
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
#endif
    return FTN_FALSE;
  }

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  // Resolve the lock object (embedded vs lock table).
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  rc = TEST_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  return (rc ? FTN_TRUE : FTN_FALSE);

/* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}
3059 
3060 /* try to acquire the lock */
__kmpc_test_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)3061 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3062 #if KMP_USE_DYNAMIC_LOCK
3063   int rc;
3064 #if USE_ITT_BUILD
3065   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3066 #endif
3067 #if OMPT_SUPPORT && OMPT_OPTIONAL
3068   // This is the case, if called from omp_init_lock_with_hint:
3069   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3070   if (!codeptr)
3071     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3072   if (ompt_enabled.ompt_callback_mutex_acquire) {
3073     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3074         ompt_mutex_nest_lock, omp_lock_hint_none,
3075         __ompt_get_mutex_impl_type(user_lock),
3076         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3077   }
3078 #endif
3079   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3080 #if USE_ITT_BUILD
3081   if (rc) {
3082     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3083   } else {
3084     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3085   }
3086 #endif
3087 #if OMPT_SUPPORT && OMPT_OPTIONAL
3088   if (ompt_enabled.enabled && rc) {
3089     if (rc == 1) {
3090       if (ompt_enabled.ompt_callback_mutex_acquired) {
3091         // lock_first
3092         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3093             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3094             codeptr);
3095       }
3096     } else {
3097       if (ompt_enabled.ompt_callback_nest_lock) {
3098         // lock_next
3099         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3100             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3101       }
3102     }
3103   }
3104 #endif
3105   return rc;
3106 
3107 #else // KMP_USE_DYNAMIC_LOCK
3108 
3109   kmp_user_lock_p lck;
3110   int rc;
3111 
3112   if ((__kmp_user_lock_kind == lk_tas) &&
3113       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3114        OMP_NEST_LOCK_T_SIZE)) {
3115     lck = (kmp_user_lock_p)user_lock;
3116   }
3117 #if KMP_USE_FUTEX
3118   else if ((__kmp_user_lock_kind == lk_futex) &&
3119            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3120             OMP_NEST_LOCK_T_SIZE)) {
3121     lck = (kmp_user_lock_p)user_lock;
3122   }
3123 #endif
3124   else {
3125     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3126   }
3127 
3128 #if USE_ITT_BUILD
3129   __kmp_itt_lock_acquiring(lck);
3130 #endif /* USE_ITT_BUILD */
3131 
3132 #if OMPT_SUPPORT && OMPT_OPTIONAL
3133   // This is the case, if called from omp_init_lock_with_hint:
3134   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3135   if (!codeptr)
3136     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3137   if (ompt_enabled.enabled) &&
3138         ompt_enabled.ompt_callback_mutex_acquire) {
3139       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3140           ompt_mutex_nest_lock, omp_lock_hint_none,
3141           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3142           codeptr);
3143     }
3144 #endif
3145 
3146   rc = TEST_NESTED_LOCK(lck, gtid);
3147 #if USE_ITT_BUILD
3148   if (rc) {
3149     __kmp_itt_lock_acquired(lck);
3150   } else {
3151     __kmp_itt_lock_cancelled(lck);
3152   }
3153 #endif /* USE_ITT_BUILD */
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155   if (ompt_enabled.enabled && rc) {
3156     if (rc == 1) {
3157       if (ompt_enabled.ompt_callback_mutex_acquired) {
3158         // lock_first
3159         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3160             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3161       }
3162     } else {
3163       if (ompt_enabled.ompt_callback_nest_lock) {
3164         // lock_next
3165         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3166             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3167       }
3168     }
3169   }
3170 #endif
3171   return rc;
3172 
3173 /* Can't use serial interval since not block structured */
3174 
3175 #endif // KMP_USE_DYNAMIC_LOCK
3176 }
3177 
// Interface to fast scalable reduce methods routines

// Keep the selected reduction method in a thread-local structure for
// cross-function usage: it is read back in the __kmpc_end_reduce* functions.
// (The alternative would be to re-determine the method one more time in the
// __kmpc_end_reduce* functions, which would require a new prototype.)
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// description of the packed_reduction_method variable: look at the macros in
// kmp.h
// used in a critical section reduce block
// Acquire the critical-section lock (lazily initializing it on first use)
// that serializes a critical-path reduction.
static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                          kmp_critical_name *crit) {

  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  //            why was it visible in previous implementation?
  //            should we keep it visible in new reduce block?
  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  if (*lk == 0) {
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      // Direct lock: initialize the tag word with an atomic CAS so only one
      // thread wins the initialization race.
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}
3257 
// used in a critical section reduce block
// Release the critical-section lock acquired by
// __kmp_enter_critical_section_reduce_block.
static __forceinline void
__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                        kmp_critical_name *crit) {

  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we have
  // to use a pointer.
  // NOTE(review): the enter side compares __kmp_base_user_lock_size against
  // INTEL_CRITICAL_SIZE while this uses the literal 32 — presumably the same
  // value; confirm and use the named constant here too.
  if (__kmp_base_user_lock_size > 32) {
    lck = *((kmp_user_lock_p *)crit);
    KMP_ASSERT(lck != NULL);
  } else {
    lck = (kmp_user_lock_p)crit;
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmp_end_critical_section_reduce_block
3299 
// Detect a reduction performed directly at a teams construct. If the thread's
// current team level matches its teams level, temporarily swap the thread's
// bookkeeping (tid, team, nproc, task team/state) to the parent team so the
// reduction runs across team masters; the saved task state is written to
// *task_state and must be restored with __kmp_restore_swapped_teams.
// Returns 1 if the swap was performed, 0 otherwise.
static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
                                     int *task_state) {
  kmp_team_t *team;

  // Check if we are inside the teams construct?
  if (th->th.th_teams_microtask) {
    *team_p = team = th->th.th_team;
    if (team->t.t_level == th->th.th_teams_level) {
      // This is reduction at teams construct.
      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
      // Let's swap teams temporarily for the reduction.
      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
      th->th.th_team = team->t.t_parent;
      // NOTE: the following reads go through the just-swapped th_team, so the
      // statement order here is significant.
      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
      *task_state = th->th.th_task_state;
      th->th.th_task_state = 0;

      return 1;
    }
  }
  return 0;
}
3324 
3325 static __forceinline void
__kmp_restore_swapped_teams(kmp_info_t * th,kmp_team_t * team,int task_state)3326 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3327   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3328   th->th.th_info.ds.ds_tid = 0;
3329   th->th.th_team = team;
3330   th->th.th_team_nproc = team->t.t_nproc;
3331   th->th.th_task_team = team->t.t_task_team[task_state];
3332   th->th.th_task_state = task_state;
3333 }
3334 
3335 /* 2.a.i. Reduce Block without a terminating barrier */
3336 /*!
3337 @ingroup SYNCHRONIZATION
3338 @param loc source location information
3339 @param global_tid global thread number
3340 @param num_vars number of items (variables) to be reduced
3341 @param reduce_size size of data in bytes to be reduced
3342 @param reduce_data pointer to data to be reduced
3343 @param reduce_func callback function providing reduction operation on two
3344 operands and returning result of reduction in lhs_data
3345 @param lck pointer to the unique lock data structure
3346 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3347 threads if atomic reduction needed
3348 
3349 The nowait version is used for a reduce clause with the nowait argument.
3350 */
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
  __kmp_assert_valid_gtid(global_tid);

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  // A reduction at a teams construct temporarily swaps the thread into the
  // parent team; undone at the end of this function.
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  //     (a thread can reach the next reduce block on the next construct, reduce
  //     method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unness extra syncs should be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unness extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  OMPT_REDUCTION_DECL(th, global_tid);
  if (packed_reduction_method == critical_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (it's not quite good, because the checking block has been closed by
    //     this 'pop',
    //      but atomic operation has not been executed yet, will be executed
    //      slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// AT: performance issue: a real barrier here
// AT:     (if master goes slow, other threads are blocked here waiting for the
// master to come and release them)
// AT:     (it's not what a customer might expect specifying NOWAIT clause)
// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with sense of NOWAIT
// AT: TO DO: do epcc test and compare times

// this barrier should be invisible to a customer and to the threading profile
// tool (it's neither a terminating barrier nor customer's code, it's
// used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially leed to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //         so we set-up the necessary frame/return addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    }
    OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    // __kmp_barrier returns nonzero for the master thread; map that to the
    // documented result codes (1 = master, 0 = other workers).
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}
3503 
3504 /*!
3505 @ingroup SYNCHRONIZATION
3506 @param loc source location information
3507 @param global_tid global thread id.
3508 @param lck pointer to the unique lock data structure
3509 
3510 Finish the execution of a reduce nowait.
3511 */
__kmpc_end_reduce_nowait(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * lck)3512 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3513                               kmp_critical_name *lck) {
3514 
3515   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3516 
3517   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3518   __kmp_assert_valid_gtid(global_tid);
3519 
3520   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3521 
3522   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3523 
3524   if (packed_reduction_method == critical_reduce_block) {
3525 
3526     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3527     OMPT_REDUCTION_END;
3528 
3529   } else if (packed_reduction_method == empty_reduce_block) {
3530 
3531     // usage: if team size == 1, no synchronization is required ( on Intel
3532     // platforms only )
3533 
3534     OMPT_REDUCTION_END;
3535 
3536   } else if (packed_reduction_method == atomic_reduce_block) {
3537 
3538     // neither master nor other workers should get here
3539     //     (code gen does not generate this call in case 2: atomic reduce block)
3540     // actually it's better to remove this elseif at all;
3541     // after removal this value will checked by the 'else' and will assert
3542 
3543   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3544                                    tree_reduce_block)) {
3545 
3546     // only master gets here
3547     // OMPT: tree reduction is annotated in the barrier code
3548 
3549   } else {
3550 
3551     // should never reach this block
3552     KMP_ASSERT(0); // "unexpected method"
3553   }
3554 
3555   if (__kmp_env_consistency_check)
3556     __kmp_pop_sync(global_tid, ct_reduce, loc);
3557 
3558   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3559                 global_tid, packed_reduction_method));
3560 
3561   return;
3562 }
3563 
3564 /* 2.a.ii. Reduce Block with a terminating barrier */
3565 
3566 /*!
3567 @ingroup SYNCHRONIZATION
3568 @param loc source location information
3569 @param global_tid global thread number
3570 @param num_vars number of items (variables) to be reduced
3571 @param reduce_size size of data in bytes to be reduced
3572 @param reduce_data pointer to data to be reduced
3573 @param reduce_func callback function providing reduction operation on two
3574 operands and returning result of reduction in lhs_data
3575 @param lck pointer to the unique lock data structure
3576 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3577 threads if atomic reduction needed
3578 
3579 A blocking reduce that includes an implicit barrier.
3580 */
__kmpc_reduce(ident_t * loc,kmp_int32 global_tid,kmp_int32 num_vars,size_t reduce_size,void * reduce_data,void (* reduce_func)(void * lhs_data,void * rhs_data),kmp_critical_name * lck)3581 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3582                         size_t reduce_size, void *reduce_data,
3583                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3584                         kmp_critical_name *lck) {
3585   KMP_COUNT_BLOCK(REDUCE_wait);
3586   int retval = 0;
3587   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3588   kmp_info_t *th;
3589   kmp_team_t *team;
3590   int teams_swapped = 0, task_state;
3591 
3592   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3593   __kmp_assert_valid_gtid(global_tid);
3594 
3595   // why do we need this initialization here at all?
3596   // Reduction clause can not be a stand-alone directive.
3597 
3598   // do not call __kmp_serial_initialize(), it will be called by
3599   // __kmp_parallel_initialize() if needed
3600   // possible detection of false-positive race by the threadchecker ???
3601   if (!TCR_4(__kmp_init_parallel))
3602     __kmp_parallel_initialize();
3603 
3604   __kmp_resume_if_soft_paused();
3605 
3606 // check correctness of reduce block nesting
3607 #if KMP_USE_DYNAMIC_LOCK
3608   if (__kmp_env_consistency_check)
3609     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3610 #else
3611   if (__kmp_env_consistency_check)
3612     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3613 #endif
3614 
3615   th = __kmp_thread_from_gtid(global_tid);
3616   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3617 
3618   packed_reduction_method = __kmp_determine_reduction_method(
3619       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3620   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3621 
3622   OMPT_REDUCTION_DECL(th, global_tid);
3623 
3624   if (packed_reduction_method == critical_reduce_block) {
3625 
3626     OMPT_REDUCTION_BEGIN;
3627     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3628     retval = 1;
3629 
3630   } else if (packed_reduction_method == empty_reduce_block) {
3631 
3632     OMPT_REDUCTION_BEGIN;
3633     // usage: if team size == 1, no synchronization is required ( Intel
3634     // platforms only )
3635     retval = 1;
3636 
3637   } else if (packed_reduction_method == atomic_reduce_block) {
3638 
3639     retval = 2;
3640 
3641   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3642                                    tree_reduce_block)) {
3643 
3644 // case tree_reduce_block:
3645 // this barrier should be visible to a customer and to the threading profile
3646 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3647 #if OMPT_SUPPORT
3648     ompt_frame_t *ompt_frame;
3649     if (ompt_enabled.enabled) {
3650       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3651       if (ompt_frame->enter_frame.ptr == NULL)
3652         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3653     }
3654     OMPT_STORE_RETURN_ADDRESS(global_tid);
3655 #endif
3656 #if USE_ITT_NOTIFY
3657     __kmp_threads[global_tid]->th.th_ident =
3658         loc; // needed for correct notification of frames
3659 #endif
3660     retval =
3661         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3662                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3663     retval = (retval != 0) ? (0) : (1);
3664 #if OMPT_SUPPORT && OMPT_OPTIONAL
3665     if (ompt_enabled.enabled) {
3666       ompt_frame->enter_frame = ompt_data_none;
3667     }
3668 #endif
3669 
3670     // all other workers except master should do this pop here
3671     // ( none of other workers except master will enter __kmpc_end_reduce() )
3672     if (__kmp_env_consistency_check) {
3673       if (retval == 0) { // 0: all other workers; 1: master
3674         __kmp_pop_sync(global_tid, ct_reduce, loc);
3675       }
3676     }
3677 
3678   } else {
3679 
3680     // should never reach this block
3681     KMP_ASSERT(0); // "unexpected method"
3682   }
3683   if (teams_swapped) {
3684     __kmp_restore_swapped_teams(th, team, task_state);
3685   }
3686 
3687   KA_TRACE(10,
3688            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3689             global_tid, packed_reduction_method, retval));
3690   return retval;
3691 }
3692 
3693 /*!
3694 @ingroup SYNCHRONIZATION
3695 @param loc source location information
3696 @param global_tid global thread id.
3697 @param lck pointer to the unique lock data structure
3698 
3699 Finish the execution of a blocking reduce.
3700 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3701 start function.
3702 */
__kmpc_end_reduce(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * lck)3703 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3704                        kmp_critical_name *lck) {
3705 
3706   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3707   kmp_info_t *th;
3708   kmp_team_t *team;
3709   int teams_swapped = 0, task_state;
3710 
3711   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3712   __kmp_assert_valid_gtid(global_tid);
3713 
3714   th = __kmp_thread_from_gtid(global_tid);
3715   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3716 
3717   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3718 
3719   // this barrier should be visible to a customer and to the threading profile
3720   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3721   OMPT_REDUCTION_DECL(th, global_tid);
3722 
3723   if (packed_reduction_method == critical_reduce_block) {
3724     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3725 
3726     OMPT_REDUCTION_END;
3727 
3728 // TODO: implicit barrier: should be exposed
3729 #if OMPT_SUPPORT
3730     ompt_frame_t *ompt_frame;
3731     if (ompt_enabled.enabled) {
3732       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3733       if (ompt_frame->enter_frame.ptr == NULL)
3734         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3735     }
3736     OMPT_STORE_RETURN_ADDRESS(global_tid);
3737 #endif
3738 #if USE_ITT_NOTIFY
3739     __kmp_threads[global_tid]->th.th_ident = loc;
3740 #endif
3741     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3742 #if OMPT_SUPPORT && OMPT_OPTIONAL
3743     if (ompt_enabled.enabled) {
3744       ompt_frame->enter_frame = ompt_data_none;
3745     }
3746 #endif
3747 
3748   } else if (packed_reduction_method == empty_reduce_block) {
3749 
3750     OMPT_REDUCTION_END;
3751 
3752 // usage: if team size==1, no synchronization is required (Intel platforms only)
3753 
3754 // TODO: implicit barrier: should be exposed
3755 #if OMPT_SUPPORT
3756     ompt_frame_t *ompt_frame;
3757     if (ompt_enabled.enabled) {
3758       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3759       if (ompt_frame->enter_frame.ptr == NULL)
3760         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3761     }
3762     OMPT_STORE_RETURN_ADDRESS(global_tid);
3763 #endif
3764 #if USE_ITT_NOTIFY
3765     __kmp_threads[global_tid]->th.th_ident = loc;
3766 #endif
3767     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3768 #if OMPT_SUPPORT && OMPT_OPTIONAL
3769     if (ompt_enabled.enabled) {
3770       ompt_frame->enter_frame = ompt_data_none;
3771     }
3772 #endif
3773 
3774   } else if (packed_reduction_method == atomic_reduce_block) {
3775 
3776 #if OMPT_SUPPORT
3777     ompt_frame_t *ompt_frame;
3778     if (ompt_enabled.enabled) {
3779       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3780       if (ompt_frame->enter_frame.ptr == NULL)
3781         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3782     }
3783     OMPT_STORE_RETURN_ADDRESS(global_tid);
3784 #endif
3785 // TODO: implicit barrier: should be exposed
3786 #if USE_ITT_NOTIFY
3787     __kmp_threads[global_tid]->th.th_ident = loc;
3788 #endif
3789     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3790 #if OMPT_SUPPORT && OMPT_OPTIONAL
3791     if (ompt_enabled.enabled) {
3792       ompt_frame->enter_frame = ompt_data_none;
3793     }
3794 #endif
3795 
3796   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3797                                    tree_reduce_block)) {
3798 
3799     // only master executes here (master releases all other workers)
3800     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3801                             global_tid);
3802 
3803   } else {
3804 
3805     // should never reach this block
3806     KMP_ASSERT(0); // "unexpected method"
3807   }
3808   if (teams_swapped) {
3809     __kmp_restore_swapped_teams(th, team, task_state);
3810   }
3811 
3812   if (__kmp_env_consistency_check)
3813     __kmp_pop_sync(global_tid, ct_reduce, loc);
3814 
3815   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3816                 global_tid, packed_reduction_method));
3817 
3818   return;
3819 }
3820 
3821 #undef __KMP_GET_REDUCTION_METHOD
3822 #undef __KMP_SET_REDUCTION_METHOD
3823 
3824 /* end of interface to fast scalable reduce routines */
3825 
__kmpc_get_taskid()3826 kmp_uint64 __kmpc_get_taskid() {
3827 
3828   kmp_int32 gtid;
3829   kmp_info_t *thread;
3830 
3831   gtid = __kmp_get_gtid();
3832   if (gtid < 0) {
3833     return 0;
3834   }
3835   thread = __kmp_thread_from_gtid(gtid);
3836   return thread->th.th_current_task->td_task_id;
3837 
3838 } // __kmpc_get_taskid
3839 
__kmpc_get_parent_taskid()3840 kmp_uint64 __kmpc_get_parent_taskid() {
3841 
3842   kmp_int32 gtid;
3843   kmp_info_t *thread;
3844   kmp_taskdata_t *parent_task;
3845 
3846   gtid = __kmp_get_gtid();
3847   if (gtid < 0) {
3848     return 0;
3849   }
3850   thread = __kmp_thread_from_gtid(gtid);
3851   parent_task = thread->th.th_current_task->td_parent;
3852   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3853 
3854 } // __kmpc_get_parent_taskid
3855 
3856 /*!
3857 @ingroup WORK_SHARING
3858 @param loc  source location information.
3859 @param gtid  global thread number.
3860 @param num_dims  number of associated doacross loops.
3861 @param dims  info on loops bounds.
3862 
3863 Initialize doacross loop information.
We expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3866 */
__kmpc_doacross_init(ident_t * loc,int gtid,int num_dims,const struct kmp_dim * dims)3867 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3868                           const struct kmp_dim *dims) {
3869   __kmp_assert_valid_gtid(gtid);
3870   int j, idx;
3871   kmp_int64 last, trace_count;
3872   kmp_info_t *th = __kmp_threads[gtid];
3873   kmp_team_t *team = th->th.th_team;
3874   kmp_uint32 *flags;
3875   kmp_disp_t *pr_buf = th->th.th_dispatch;
3876   dispatch_shared_info_t *sh_buf;
3877 
3878   KA_TRACE(
3879       20,
3880       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3881        gtid, num_dims, !team->t.t_serialized));
3882   KMP_DEBUG_ASSERT(dims != NULL);
3883   KMP_DEBUG_ASSERT(num_dims > 0);
3884 
3885   if (team->t.t_serialized) {
3886     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3887     return; // no dependencies if team is serialized
3888   }
3889   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3890   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3891   // the next loop
3892   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3893 
3894   // Save bounds info into allocated private buffer
3895   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3896   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3897       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3898   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3899   pr_buf->th_doacross_info[0] =
3900       (kmp_int64)num_dims; // first element is number of dimensions
3901   // Save also address of num_done in order to access it later without knowing
3902   // the buffer index
3903   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3904   pr_buf->th_doacross_info[2] = dims[0].lo;
3905   pr_buf->th_doacross_info[3] = dims[0].up;
3906   pr_buf->th_doacross_info[4] = dims[0].st;
3907   last = 5;
3908   for (j = 1; j < num_dims; ++j) {
3909     kmp_int64
3910         range_length; // To keep ranges of all dimensions but the first dims[0]
3911     if (dims[j].st == 1) { // most common case
3912       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3913       range_length = dims[j].up - dims[j].lo + 1;
3914     } else {
3915       if (dims[j].st > 0) {
3916         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3917         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3918       } else { // negative increment
3919         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3920         range_length =
3921             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3922       }
3923     }
3924     pr_buf->th_doacross_info[last++] = range_length;
3925     pr_buf->th_doacross_info[last++] = dims[j].lo;
3926     pr_buf->th_doacross_info[last++] = dims[j].up;
3927     pr_buf->th_doacross_info[last++] = dims[j].st;
3928   }
3929 
3930   // Compute total trip count.
3931   // Start with range of dims[0] which we don't need to keep in the buffer.
3932   if (dims[0].st == 1) { // most common case
3933     trace_count = dims[0].up - dims[0].lo + 1;
3934   } else if (dims[0].st > 0) {
3935     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3936     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3937   } else { // negative increment
3938     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3939     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3940   }
3941   for (j = 1; j < num_dims; ++j) {
3942     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3943   }
3944   KMP_DEBUG_ASSERT(trace_count > 0);
3945 
3946   // Check if shared buffer is not occupied by other loop (idx -
3947   // __kmp_dispatch_num_buffers)
3948   if (idx != sh_buf->doacross_buf_idx) {
3949     // Shared buffer is occupied, wait for it to be free
3950     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3951                  __kmp_eq_4, NULL);
3952   }
3953 #if KMP_32_BIT_ARCH
3954   // Check if we are the first thread. After the CAS the first thread gets 0,
3955   // others get 1 if initialization is in progress, allocated pointer otherwise.
3956   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3957   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3958       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3959 #else
3960   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3961       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3962 #endif
3963   if (flags == NULL) {
3964     // we are the first thread, allocate the array of flags
3965     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3966     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3967     KMP_MB();
3968     sh_buf->doacross_flags = flags;
3969   } else if (flags == (kmp_uint32 *)1) {
3970 #if KMP_32_BIT_ARCH
3971     // initialization is still in progress, need to wait
3972     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3973 #else
3974     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3975 #endif
3976       KMP_YIELD(TRUE);
3977     KMP_MB();
3978   } else {
3979     KMP_MB();
3980   }
3981   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3982   pr_buf->th_doacross_flags =
3983       sh_buf->doacross_flags; // save private copy in order to not
3984   // touch shared buffer on each iteration
3985   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3986 }
3987 
__kmpc_doacross_wait(ident_t * loc,int gtid,const kmp_int64 * vec)3988 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3989   __kmp_assert_valid_gtid(gtid);
3990   kmp_int32 shft, num_dims, i;
3991   kmp_uint32 flag;
3992   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3993   kmp_info_t *th = __kmp_threads[gtid];
3994   kmp_team_t *team = th->th.th_team;
3995   kmp_disp_t *pr_buf;
3996   kmp_int64 lo, up, st;
3997 
3998   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3999   if (team->t.t_serialized) {
4000     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4001     return; // no dependencies if team is serialized
4002   }
4003 
4004   // calculate sequential iteration number and check out-of-bounds condition
4005   pr_buf = th->th.th_dispatch;
4006   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4007   num_dims = pr_buf->th_doacross_info[0];
4008   lo = pr_buf->th_doacross_info[2];
4009   up = pr_buf->th_doacross_info[3];
4010   st = pr_buf->th_doacross_info[4];
4011 #if OMPT_SUPPORT && OMPT_OPTIONAL
4012   ompt_dependence_t deps[num_dims];
4013 #endif
4014   if (st == 1) { // most common case
4015     if (vec[0] < lo || vec[0] > up) {
4016       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4017                     "bounds [%lld,%lld]\n",
4018                     gtid, vec[0], lo, up));
4019       return;
4020     }
4021     iter_number = vec[0] - lo;
4022   } else if (st > 0) {
4023     if (vec[0] < lo || vec[0] > up) {
4024       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4025                     "bounds [%lld,%lld]\n",
4026                     gtid, vec[0], lo, up));
4027       return;
4028     }
4029     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4030   } else { // negative increment
4031     if (vec[0] > lo || vec[0] < up) {
4032       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4033                     "bounds [%lld,%lld]\n",
4034                     gtid, vec[0], lo, up));
4035       return;
4036     }
4037     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4038   }
4039 #if OMPT_SUPPORT && OMPT_OPTIONAL
4040   deps[0].variable.value = iter_number;
4041   deps[0].dependence_type = ompt_dependence_type_sink;
4042 #endif
4043   for (i = 1; i < num_dims; ++i) {
4044     kmp_int64 iter, ln;
4045     kmp_int32 j = i * 4;
4046     ln = pr_buf->th_doacross_info[j + 1];
4047     lo = pr_buf->th_doacross_info[j + 2];
4048     up = pr_buf->th_doacross_info[j + 3];
4049     st = pr_buf->th_doacross_info[j + 4];
4050     if (st == 1) {
4051       if (vec[i] < lo || vec[i] > up) {
4052         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4053                       "bounds [%lld,%lld]\n",
4054                       gtid, vec[i], lo, up));
4055         return;
4056       }
4057       iter = vec[i] - lo;
4058     } else if (st > 0) {
4059       if (vec[i] < lo || vec[i] > up) {
4060         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4061                       "bounds [%lld,%lld]\n",
4062                       gtid, vec[i], lo, up));
4063         return;
4064       }
4065       iter = (kmp_uint64)(vec[i] - lo) / st;
4066     } else { // st < 0
4067       if (vec[i] > lo || vec[i] < up) {
4068         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4069                       "bounds [%lld,%lld]\n",
4070                       gtid, vec[i], lo, up));
4071         return;
4072       }
4073       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4074     }
4075     iter_number = iter + ln * iter_number;
4076 #if OMPT_SUPPORT && OMPT_OPTIONAL
4077     deps[i].variable.value = iter;
4078     deps[i].dependence_type = ompt_dependence_type_sink;
4079 #endif
4080   }
4081   shft = iter_number % 32; // use 32-bit granularity
4082   iter_number >>= 5; // divided by 32
4083   flag = 1 << shft;
4084   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4085     KMP_YIELD(TRUE);
4086   }
4087   KMP_MB();
4088 #if OMPT_SUPPORT && OMPT_OPTIONAL
4089   if (ompt_enabled.ompt_callback_dependences) {
4090     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4091         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4092   }
4093 #endif
4094   KA_TRACE(20,
4095            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4096             gtid, (iter_number << 5) + shft));
4097 }
4098 
/*!
@ingroup WORK_SHARING
@param loc  source location information (unused here)
@param gtid global thread number
@param vec  iteration vector of the source iteration being posted, one
            coordinate per dimension of the collapsed loop nest

Signal completion ("source" side of an ordered(n) doacross dependence) of the
iteration identified by @p vec: linearize the multi-dimensional iteration
coordinates into a single sequential iteration number, then set the
corresponding bit in the shared doacross flag array so that threads spinning
in __kmpc_doacross_wait() on this iteration are released.
*/
void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  __kmp_assert_valid_gtid(gtid);
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  // th_doacross_info layout (set up by __kmpc_doacross_init):
  //   [0] = number of dimensions; then 4 entries per dimension starting at
  //   [1]: trip count, lower bound, upper bound, stride.
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2]; // lower bound of dimension 0
  st = pr_buf->th_doacross_info[4]; // stride of dimension 0
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_dependence_t deps[num_dims];
#endif
  // Normalize vec[0] to a zero-based logical iteration index for dimension 0.
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
#if OMPT_SUPPORT && OMPT_OPTIONAL
  deps[0].variable.value = iter_number;
  deps[0].dependence_type = ompt_dependence_type_source;
#endif
  // Fold the remaining dimensions in row-major order:
  // iter_number = ((d0 * len1 + d1) * len2 + d2) * ...
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4; // offset of this dimension's 4-entry record
    ln = pr_buf->th_doacross_info[j + 1]; // trip count of dimension i
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
#if OMPT_SUPPORT && OMPT_OPTIONAL
    deps[i].variable.value = iter;
    deps[i].dependence_type = ompt_dependence_type_source;
#endif
  }
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_dependences) {
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
  }
#endif
  // Each flag word tracks 32 iterations: word index = iter_number / 32,
  // bit within the word = iter_number % 32.
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  // Full memory barrier: make this iteration's loop-body stores visible to
  // other threads before the completion bit is published.
  KMP_MB();
  // Cheap read first to skip the atomic RMW when the bit is already set
  // (e.g. posted earlier by the same thread).
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}
4170 
/*!
@ingroup WORK_SHARING
@param loc  source location information (unused here)
@param gtid global thread number

Finalize a doacross loop for the calling thread. Atomically counts threads
arriving here; the last thread of the team frees the shared flag array and
recycles the dispatch buffer slot. Every thread then releases its private
doacross bookkeeping (th_doacross_info), keeping only the buffer index.
*/
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  // th_doacross_info[1] holds (as an integer) the address of the shared
  // doacross_num_done counter; atomically bump it and note our arrival order.
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    // Advancing by the number of buffers marks this slot reusable for the
    // next doacross loop that wraps around to it.
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
4205 
/* omp_alloc/omp_calloc/omp_realloc/omp_free only defined for C/C++, not for
   Fortran */
omp_alloc(size_t size,omp_allocator_handle_t allocator)4207 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4208   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4209 }
4210 
omp_calloc(size_t nmemb,size_t size,omp_allocator_handle_t allocator)4211 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4212   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4213 }
4214 
omp_realloc(void * ptr,size_t size,omp_allocator_handle_t allocator,omp_allocator_handle_t free_allocator)4215 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4216                   omp_allocator_handle_t free_allocator) {
4217   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4218                         free_allocator);
4219 }
4220 
omp_free(void * ptr,omp_allocator_handle_t allocator)4221 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4222   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4223 }
4224 
__kmpc_get_target_offload(void)4225 int __kmpc_get_target_offload(void) {
4226   if (!__kmp_init_serial) {
4227     __kmp_serial_initialize();
4228   }
4229   return __kmp_target_offload;
4230 }
4231 
__kmpc_pause_resource(kmp_pause_status_t level)4232 int __kmpc_pause_resource(kmp_pause_status_t level) {
4233   if (!__kmp_init_serial) {
4234     return 1; // Can't pause if runtime is not initialized
4235   }
4236   return __kmp_pause_resource(level);
4237 }
4238