LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 /* Calculate the identifier of the current thread */
111 /* fast (and somewhat portable) way to get a unique identifier for the executing
112  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
113 int __kmp_get_global_thread_id() {
114  int i;
115  kmp_info_t **other_threads;
116  size_t stack_data;
117  char *stack_addr;
118  size_t stack_size;
119  char *stack_base;
120 
121  KA_TRACE(
122  1000,
123  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
124  __kmp_nth, __kmp_all_nth));
125 
126  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
127  a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
128  the caller. We had to handle KMP_GTID_DNE at all call sites, or else
129  guarantee __kmp_init_gtid for this to work. */
130 
131  if (!TCR_4(__kmp_init_gtid))
132  return KMP_GTID_DNE;
133 
134 #ifdef KMP_TDATA_GTID
135  if (TCR_4(__kmp_gtid_mode) >= 3) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
137  return __kmp_gtid;
138  }
139 #endif
140  if (TCR_4(__kmp_gtid_mode) >= 2) {
141  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
142  return __kmp_gtid_get_specific();
143  }
144  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
145 
146  stack_addr = (char *)&stack_data;
147  other_threads = __kmp_threads;
148 
149  /* ATT: The code below is a source of potential bugs due to unsynchronized
150  access to __kmp_threads array. For example:
151  1. Current thread loads other_threads[i] to thr and checks it, it is
152  non-NULL.
153  2. Current thread is suspended by OS.
154  3. Another thread unregisters and finishes (debug versions of free()
155  may fill memory with something like 0xEF).
156  4. Current thread is resumed.
157  5. Current thread reads junk from *thr.
158  TODO: Fix it. --ln */
159 
160  for (i = 0; i < __kmp_threads_capacity; i++) {
161 
162  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
163  if (!thr)
164  continue;
165 
166  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
167  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
168 
169  /* stack grows down -- search through all of the active threads */
170 
171  if (stack_addr <= stack_base) {
172  size_t stack_diff = stack_base - stack_addr;
173 
174  if (stack_diff <= stack_size) {
175  /* The only way we can be closer than the allocated */
176  /* stack size is if we are running on this thread. */
177  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
178  return i;
179  }
180  }
181  }
182 
183  /* get specific to try and determine our gtid */
184  KA_TRACE(1000,
185  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
186  "thread, using TLS\n"));
187  i = __kmp_gtid_get_specific();
188 
189  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
190 
191  /* if we haven't been assigned a gtid, then just return that code */
192  if (i < 0)
193  return i;
194 
195  /* dynamically updated stack window for uber threads to avoid get_specific
196  call */
197  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
198  KMP_FATAL(StackOverflow, i);
199  }
200 
201  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
202  if (stack_addr > stack_base) {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
204  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
205  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
206  stack_base);
207  } else {
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  stack_base - stack_addr);
210  }
211 
212  /* Reprint stack bounds for ubermaster since they have been refined */
213  if (__kmp_storage_map) {
214  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
215  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
216  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
217  other_threads[i]->th.th_info.ds.ds_stacksize,
218  "th_%d stack (refinement)", i);
219  }
220  return i;
221 }
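// The "internal alg." above identifies the calling thread by its stack
// address: the address of any local variable must fall inside the window
// [ds_stackbase - ds_stacksize, ds_stackbase] recorded for exactly one
// registered thread, because stacks grow down. A minimal sketch of that
// containment test (hypothetical helper, kept out of the build):
#if 0
static bool stack_window_contains(char *stack_base, size_t stack_size,
                                  void *addr) {
  char *a = (char *)addr;
  // In the window iff addr is at or below the base and within stack_size of it.
  return a <= stack_base && (size_t)(stack_base - a) <= stack_size;
}
#endif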
222 
223 int __kmp_get_global_thread_id_reg() {
224  int gtid;
225 
226  if (!__kmp_init_serial) {
227  gtid = KMP_GTID_DNE;
228  } else
229 #ifdef KMP_TDATA_GTID
230  if (TCR_4(__kmp_gtid_mode) >= 3) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
232  gtid = __kmp_gtid;
233  } else
234 #endif
235  if (TCR_4(__kmp_gtid_mode) >= 2) {
236  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
237  gtid = __kmp_gtid_get_specific();
238  } else {
239  KA_TRACE(1000,
240  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
241  gtid = __kmp_get_global_thread_id();
242  }
243 
244  /* we must be a new uber master sibling thread */
245  if (gtid == KMP_GTID_DNE) {
246  KA_TRACE(10,
247  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
248  "Registering a new gtid.\n"));
249  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
250  if (!__kmp_init_serial) {
251  __kmp_do_serial_initialize();
252  gtid = __kmp_gtid_get_specific();
253  } else {
254  gtid = __kmp_register_root(FALSE);
255  }
256  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
257  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
258  }
259 
260  KMP_DEBUG_ASSERT(gtid >= 0);
261 
262  return gtid;
263 }
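// Summary of the __kmp_gtid_mode dispatch used above and in
// __kmp_get_global_thread_id(): mode >= 3 reads the gtid from thread-local
// data (__kmp_gtid, when KMP_TDATA_GTID is defined), mode >= 2 queries the
// keyed TLS API (__kmp_gtid_get_specific()), and lower modes fall back to the
// stack-window search. A KMP_GTID_DNE result here means the caller is a new
// root thread, which is then registered under __kmp_initz_lock.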
264 
265 /* caller must hold forkjoin_lock */
266 void __kmp_check_stack_overlap(kmp_info_t *th) {
267  int f;
268  char *stack_beg = NULL;
269  char *stack_end = NULL;
270  int gtid;
271 
272  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
273  if (__kmp_storage_map) {
274  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
275  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
276 
277  gtid = __kmp_gtid_from_thread(th);
278 
279  if (gtid == KMP_GTID_MONITOR) {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%s stack (%s)", "mon",
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  } else {
285  __kmp_print_storage_map_gtid(
286  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
287  "th_%d stack (%s)", gtid,
288  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
289  }
290  }
291 
292  /* No point in checking ubermaster threads since they use refinement and
293  * cannot overlap */
294  gtid = __kmp_gtid_from_thread(th);
295  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
296  KA_TRACE(10,
297  ("__kmp_check_stack_overlap: performing extensive checking\n"));
298  if (stack_beg == NULL) {
299  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
300  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
301  }
302 
303  for (f = 0; f < __kmp_threads_capacity; f++) {
304  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
305 
306  if (f_th && f_th != th) {
307  char *other_stack_end =
308  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
309  char *other_stack_beg =
310  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
311  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
312  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
313 
314  /* Print the other stack values before the abort */
315  if (__kmp_storage_map)
316  __kmp_print_storage_map_gtid(
317  -1, other_stack_beg, other_stack_end,
318  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
319  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
320 
321  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
322  __kmp_msg_null);
323  }
324  }
325  }
326  }
327  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
328 }
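// The test above reports an overlap when either endpoint of this thread's
// stack window lands strictly inside another registered thread's window.
// Minimal sketch of that endpoint check (hypothetical helper, kept out of
// the build):
#if 0
static bool endpoint_inside(char *p, char *beg, char *end) {
  return p > beg && p < end;
}
// overlap when endpoint_inside(stack_beg, other_beg, other_end) ||
//              endpoint_inside(stack_end, other_beg, other_end)
#endif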
329 
330 /* ------------------------------------------------------------------------ */
331 
332 void __kmp_infinite_loop(void) {
333  static int done = FALSE;
334 
335  while (!done) {
336  KMP_YIELD(TRUE);
337  }
338 }
339 
340 #define MAX_MESSAGE 512
341 
342 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
343  char const *format, ...) {
344  char buffer[MAX_MESSAGE];
345  va_list ap;
346 
347  va_start(ap, format);
348  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
349  p2, (unsigned long)size, format);
350  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
351  __kmp_vprintf(kmp_err, buffer, ap);
352 #if KMP_PRINT_DATA_PLACEMENT
353  int node;
354  if (gtid >= 0) {
355  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
356  if (__kmp_storage_map_verbose) {
357  node = __kmp_get_host_node(p1);
358  if (node < 0) /* doesn't work, so don't try this next time */
359  __kmp_storage_map_verbose = FALSE;
360  else {
361  char *last;
362  int lastNode;
363  int localProc = __kmp_get_cpu_from_gtid(gtid);
364 
365  const int page_size = KMP_GET_PAGE_SIZE();
366 
367  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
368  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
369  if (localProc >= 0)
370  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
371  localProc >> 1);
372  else
373  __kmp_printf_no_lock(" GTID %d\n", gtid);
374 #if KMP_USE_PRCTL
375  /* The more elaborate format is disabled for now because of the prctl
376  * hanging bug. */
377  do {
378  last = p1;
379  lastNode = node;
380  /* This loop collates adjacent pages with the same host node. */
381  do {
382  (char *)p1 += page_size;
383  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
384  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
385  lastNode);
386  } while (p1 <= p2);
387 #else
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
389  (char *)p1 + (page_size - 1),
390  __kmp_get_host_node(p1));
391  if (p1 < p2) {
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
393  (char *)p2 + (page_size - 1),
394  __kmp_get_host_node(p2));
395  }
396 #endif
397  }
398  }
399  } else
400  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
401  }
402 #endif /* KMP_PRINT_DATA_PLACEMENT */
403  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
404 }
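// Example of the call pattern above, with hypothetical values: a caller such
// as
//   __kmp_print_storage_map_gtid(0, beg, beg + 4096, 4096, "th_%d stack", 0);
// emits one line on kmp_err of the form
//   OMP storage map: <beg> <beg+4096>    4096 th_0 stack
// and, when KMP_PRINT_DATA_PLACEMENT is enabled and verbose mode is on, a
// per-page memNode breakdown after it.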
405 
406 void __kmp_warn(char const *format, ...) {
407  char buffer[MAX_MESSAGE];
408  va_list ap;
409 
410  if (__kmp_generate_warnings == kmp_warnings_off) {
411  return;
412  }
413 
414  va_start(ap, format);
415 
416  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
417  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
418  __kmp_vprintf(kmp_err, buffer, ap);
419  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
420 
421  va_end(ap);
422 }
423 
424 void __kmp_abort_process() {
425  // Later threads may stall here, but that's ok because abort() will kill them.
426  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
427 
428  if (__kmp_debug_buf) {
429  __kmp_dump_debug_buffer();
430  }
431 
432  if (KMP_OS_WINDOWS) {
433  // Let other threads know of abnormal termination and prevent deadlock
434  // if abort happened during library initialization or shutdown
435  __kmp_global.g.g_abort = SIGABRT;
436 
437  /* On Windows* OS, by default abort() causes a pop-up error box, which
438  stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
439  error boxes. _set_abort_behavior() works well, but this function is not
440  available in VS7 (this is not a problem for the DLL, but it is a problem
441  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
442  does not help, at least in some versions of the MS C RTL.
443 
444  It seems the following sequence is the only way to simulate abort() and
445  avoid the pop-up error box. */
446  raise(SIGABRT);
447  _exit(3); // Just in case, if signal ignored, exit anyway.
448  } else {
449  __kmp_unregister_library();
450  abort();
451  }
452 
453  __kmp_infinite_loop();
454  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
455 
456 } // __kmp_abort_process
457 
458 void __kmp_abort_thread(void) {
459  // TODO: Eliminate g_abort global variable and this function.
460  // In case of abort just call abort(), it will kill all the threads.
461  __kmp_infinite_loop();
462 } // __kmp_abort_thread
463 
464 /* Print out the storage map for the major kmp_info_t thread data structures
465  that are allocated together. */
466 
467 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
468  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
469  gtid);
470 
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
472  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
475  sizeof(kmp_local_t), "th_%d.th_local", gtid);
476 
477  __kmp_print_storage_map_gtid(
478  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
479  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
480 
481  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
482  &thr->th.th_bar[bs_plain_barrier + 1],
483  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
484  gtid);
485 
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
487  &thr->th.th_bar[bs_forkjoin_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
489  gtid);
490 
491 #if KMP_FAST_REDUCTION_BARRIER
492  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
493  &thr->th.th_bar[bs_reduction_barrier + 1],
494  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
495  gtid);
496 #endif // KMP_FAST_REDUCTION_BARRIER
497 }
498 
499 /* Print out the storage map for the major kmp_team_t team data structures
500  that are allocated together. */
501 
502 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
503  int team_id, int num_thr) {
504  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
505  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
506  header, team_id);
507 
508  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
509  &team->t.t_bar[bs_last_barrier],
510  sizeof(kmp_balign_team_t) * bs_last_barrier,
511  "%s_%d.t_bar", header, team_id);
512 
513  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
514  &team->t.t_bar[bs_plain_barrier + 1],
515  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
516  header, team_id);
517 
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
519  &team->t.t_bar[bs_forkjoin_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[forkjoin]", header, team_id);
522 
523 #if KMP_FAST_REDUCTION_BARRIER
524  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
525  &team->t.t_bar[bs_reduction_barrier + 1],
526  sizeof(kmp_balign_team_t),
527  "%s_%d.t_bar[reduction]", header, team_id);
528 #endif // KMP_FAST_REDUCTION_BARRIER
529 
530  __kmp_print_storage_map_gtid(
531  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
532  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
536  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
537 
538  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
539  &team->t.t_disp_buffer[num_disp_buff],
540  sizeof(dispatch_shared_info_t) * num_disp_buff,
541  "%s_%d.t_disp_buffer", header, team_id);
542 }
543 
544 static void __kmp_init_allocator() {
545  __kmp_init_memkind();
546  __kmp_init_target_mem();
547 }
548 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
549 
550 /* ------------------------------------------------------------------------ */
551 
552 #if KMP_DYNAMIC_LIB
553 #if KMP_OS_WINDOWS
554 
555 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
556  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
557 
558  switch (fdwReason) {
559 
560  case DLL_PROCESS_ATTACH:
561  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
562 
563  return TRUE;
564 
565  case DLL_PROCESS_DETACH:
566  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
567 
568  // According to Windows* documentation for DllMain entry point:
569  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
570  // lpReserved == NULL when FreeLibrary() is called,
571  // lpReserved != NULL when the process is terminated.
572  // When FreeLibrary() is called, worker threads remain alive. So the
573  // runtime's state is consistent and executing proper shutdown is OK.
574  // When the process is terminated, worker threads have exited or been
575  // forcefully terminated by the OS and only the shutdown thread remains.
576  // This can leave the runtime in an inconsistent state.
577  // Hence, only attempt proper cleanup when FreeLibrary() is called.
578  // Otherwise, rely on OS to reclaim resources.
579  if (lpReserved == NULL)
580  __kmp_internal_end_library(__kmp_gtid_get_specific());
581 
582  return TRUE;
583 
584  case DLL_THREAD_ATTACH:
585  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
586 
587  /* if we want to register new siblings all the time, call
588  * __kmp_get_gtid() here */
589  return TRUE;
590 
591  case DLL_THREAD_DETACH:
592  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
593 
594  __kmp_internal_end_thread(__kmp_gtid_get_specific());
595  return TRUE;
596  }
597 
598  return TRUE;
599 }
600 
601 #endif /* KMP_OS_WINDOWS */
602 #endif /* KMP_DYNAMIC_LIB */
603 
604 /* __kmp_parallel_deo -- Wait until it's our turn. */
605 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
606  int gtid = *gtid_ref;
607 #ifdef BUILD_PARALLEL_ORDERED
608  kmp_team_t *team = __kmp_team_from_gtid(gtid);
609 #endif /* BUILD_PARALLEL_ORDERED */
610 
611  if (__kmp_env_consistency_check) {
612  if (__kmp_threads[gtid]->th.th_root->r.r_active)
613 #if KMP_USE_DYNAMIC_LOCK
614  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
615 #else
616  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
617 #endif
618  }
619 #ifdef BUILD_PARALLEL_ORDERED
620  if (!team->t.t_serialized) {
621  KMP_MB();
622  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
623  NULL);
624  KMP_MB();
625  }
626 #endif /* BUILD_PARALLEL_ORDERED */
627 }
628 
629 /* __kmp_parallel_dxo -- Signal the next task. */
630 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631  int gtid = *gtid_ref;
632 #ifdef BUILD_PARALLEL_ORDERED
633  int tid = __kmp_tid_from_gtid(gtid);
634  kmp_team_t *team = __kmp_team_from_gtid(gtid);
635 #endif /* BUILD_PARALLEL_ORDERED */
636 
637  if (__kmp_env_consistency_check) {
638  if (__kmp_threads[gtid]->th.th_root->r.r_active)
639  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
640  }
641 #ifdef BUILD_PARALLEL_ORDERED
642  if (!team->t.t_serialized) {
643  KMP_MB(); /* Flush all pending memory write invalidates. */
644 
645  /* use the tid of the next thread in this team */
646  /* TODO replace with general release procedure */
647  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
648 
649  KMP_MB(); /* Flush all pending memory write invalidates. */
650  }
651 #endif /* BUILD_PARALLEL_ORDERED */
652 }
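// Together, __kmp_parallel_deo/__kmp_parallel_dxo implement ORDERED as a
// round-robin ticket: each thread waits until the shared counter equals its
// tid, runs its ordered chunk, then hands the turn to (tid + 1) % nproc.
// A minimal sketch of the same protocol with plain std::atomic (illustration
// only, kept out of the build):
#if 0
#include <atomic>

static std::atomic<int> turn{0}; // tid whose ordered chunk may run next

static void ordered_enter(int tid) {
  while (turn.load(std::memory_order_acquire) != tid) {
    // spin (the runtime waits via KMP_WAIT instead of a raw busy loop)
  }
}

static void ordered_exit(int tid, int nproc) {
  turn.store((tid + 1) % nproc, std::memory_order_release);
}
#endif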
653 
654 /* ------------------------------------------------------------------------ */
655 /* The BARRIER for a SINGLE process section is always explicit */
656 
657 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
658  int status;
659  kmp_info_t *th;
660  kmp_team_t *team;
661 
662  if (!TCR_4(__kmp_init_parallel))
663  __kmp_parallel_initialize();
664  __kmp_resume_if_soft_paused();
665 
666  th = __kmp_threads[gtid];
667  team = th->th.th_team;
668  status = 0;
669 
670  th->th.th_ident = id_ref;
671 
672  if (team->t.t_serialized) {
673  status = 1;
674  } else {
675  kmp_int32 old_this = th->th.th_local.this_construct;
676 
677  ++th->th.th_local.this_construct;
678  /* try to set team count to thread count--success means thread got the
679  single block */
680  /* TODO: Should this be acquire or release? */
681  if (team->t.t_construct == old_this) {
682  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
683  th->th.th_local.this_construct);
684  }
685 #if USE_ITT_BUILD
686  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
687  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
688  team->t.t_active_level == 1) {
689  // Only report metadata by primary thread of active team at level 1
690  __kmp_itt_metadata_single(id_ref);
691  }
692 #endif /* USE_ITT_BUILD */
693  }
694 
695  if (__kmp_env_consistency_check) {
696  if (status && push_ws) {
697  __kmp_push_workshare(gtid, ct_psingle, id_ref);
698  } else {
699  __kmp_check_workshare(gtid, ct_psingle, id_ref);
700  }
701  }
702 #if USE_ITT_BUILD
703  if (status) {
704  __kmp_itt_single_start(gtid);
705  }
706 #endif /* USE_ITT_BUILD */
707  return status;
708 }
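// A SINGLE construct has exactly one winner per encounter: every thread bumps
// its private this_construct counter, and the thread whose compare-and-swap
// advances the team-wide t_construct counter executes the block. A minimal
// sketch of that election with std::atomic (illustration only, hypothetical
// names, kept out of the build):
#if 0
#include <atomic>

// Both counters start equal; returns true for exactly one caller per round.
static bool try_win_single(std::atomic<int> &team_count, int &my_count) {
  int expected = my_count++; // value both counters held before this SINGLE
  return team_count.compare_exchange_strong(expected, my_count);
}
#endif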
709 
710 void __kmp_exit_single(int gtid) {
711 #if USE_ITT_BUILD
712  __kmp_itt_single_end(gtid);
713 #endif /* USE_ITT_BUILD */
714  if (__kmp_env_consistency_check)
715  __kmp_pop_workshare(gtid, ct_psingle, NULL);
716 }
717 
718 /* Determine whether we can go parallel or must serialize the parallel region,
719  * and how many threads we can use.
720  * set_nthreads is the number of threads requested for the team.
721  * Returns 1 if we should serialize or only use one thread,
722  * otherwise the number of threads to use.
723  * The forkjoin lock is held by the caller. */
724 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
725  int master_tid, int set_nthreads,
726  int enter_teams) {
727  int capacity;
728  int new_nthreads;
729  KMP_DEBUG_ASSERT(__kmp_init_serial);
730  KMP_DEBUG_ASSERT(root && parent_team);
731  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
732 
733  // If dyn-var is set, dynamically adjust the number of desired threads,
734  // according to the method specified by dynamic_mode.
735  new_nthreads = set_nthreads;
736  if (!get__dynamic_2(parent_team, master_tid)) {
737  ;
738  }
739 #ifdef USE_LOAD_BALANCE
740  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
741  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
742  if (new_nthreads == 1) {
743  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
744  "reservation to 1 thread\n",
745  master_tid));
746  return 1;
747  }
748  if (new_nthreads < set_nthreads) {
749  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
750  "reservation to %d threads\n",
751  master_tid, new_nthreads));
752  }
753  }
754 #endif /* USE_LOAD_BALANCE */
755  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
756  new_nthreads = __kmp_avail_proc - __kmp_nth +
757  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
758  if (new_nthreads <= 1) {
759  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
760  "reservation to 1 thread\n",
761  master_tid));
762  return 1;
763  }
764  if (new_nthreads < set_nthreads) {
765  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
766  "reservation to %d threads\n",
767  master_tid, new_nthreads));
768  } else {
769  new_nthreads = set_nthreads;
770  }
771  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
772  if (set_nthreads > 2) {
773  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
774  new_nthreads = (new_nthreads % set_nthreads) + 1;
775  if (new_nthreads == 1) {
776  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
777  "reservation to 1 thread\n",
778  master_tid));
779  return 1;
780  }
781  if (new_nthreads < set_nthreads) {
782  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
783  "reservation to %d threads\n",
784  master_tid, new_nthreads));
785  }
786  }
787  } else {
788  KMP_ASSERT(0);
789  }
790 
791  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
792  if (__kmp_nth + new_nthreads -
793  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
794  __kmp_max_nth) {
795  int tl_nthreads = __kmp_max_nth - __kmp_nth +
796  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
797  if (tl_nthreads <= 0) {
798  tl_nthreads = 1;
799  }
800 
801  // If dyn-var is false, emit a 1-time warning.
802  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
803  __kmp_reserve_warn = 1;
804  __kmp_msg(kmp_ms_warning,
805  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
806  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
807  }
808  if (tl_nthreads == 1) {
809  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
810  "reduced reservation to 1 thread\n",
811  master_tid));
812  return 1;
813  }
814  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
815  "reservation to %d threads\n",
816  master_tid, tl_nthreads));
817  new_nthreads = tl_nthreads;
818  }
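// Worked example of the clamp above, with hypothetical numbers: suppose
// __kmp_max_nth = 8, __kmp_nth = 5 threads already exist, the root is active
// (so the forking primary thread is counted once already), and the region
// requests new_nthreads = 6. Then 5 + 6 - 1 = 10 > 8, so
// tl_nthreads = 8 - 5 + 1 = 4 and the team is formed with 4 threads instead,
// plus a one-time CantFormThrTeam warning if dyn-var is false.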
819 
820  // Respect OMP_THREAD_LIMIT
821  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
822  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
823  if (cg_nthreads + new_nthreads -
824  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
825  max_cg_threads) {
826  int tl_nthreads = max_cg_threads - cg_nthreads +
827  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
828  if (tl_nthreads <= 0) {
829  tl_nthreads = 1;
830  }
831 
832  // If dyn-var is false, emit a 1-time warning.
833  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
834  __kmp_reserve_warn = 1;
835  __kmp_msg(kmp_ms_warning,
836  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
837  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
838  }
839  if (tl_nthreads == 1) {
840  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
841  "reduced reservation to 1 thread\n",
842  master_tid));
843  return 1;
844  }
845  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
846  "reservation to %d threads\n",
847  master_tid, tl_nthreads));
848  new_nthreads = tl_nthreads;
849  }
850 
851  // Check if the threads array is large enough, or needs expanding.
852  // See comment in __kmp_register_root() about the adjustment if
853  // __kmp_threads[0] == NULL.
854  capacity = __kmp_threads_capacity;
855  if (TCR_PTR(__kmp_threads[0]) == NULL) {
856  --capacity;
857  }
858  // If it is not for initializing the hidden helper team, we need to take
859  // __kmp_hidden_helper_threads_num out of the capacity because it is included
860  // in __kmp_threads_capacity.
861  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
862  capacity -= __kmp_hidden_helper_threads_num;
863  }
864  if (__kmp_nth + new_nthreads -
865  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
866  capacity) {
867  // Expand the threads array.
868  int slotsRequired = __kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
870  capacity;
871  int slotsAdded = __kmp_expand_threads(slotsRequired);
872  if (slotsAdded < slotsRequired) {
873  // The threads array was not expanded enough.
874  new_nthreads -= (slotsRequired - slotsAdded);
875  KMP_ASSERT(new_nthreads >= 1);
876 
877  // If dyn-var is false, emit a 1-time warning.
878  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879  __kmp_reserve_warn = 1;
880  if (__kmp_tp_cached) {
881  __kmp_msg(kmp_ms_warning,
882  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
883  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
884  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
885  } else {
886  __kmp_msg(kmp_ms_warning,
887  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
888  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
889  }
890  }
891  }
892  }
893 
894 #ifdef KMP_DEBUG
895  if (new_nthreads == 1) {
896  KC_TRACE(10,
897  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
898  "dead roots and rechecking; requested %d threads\n",
899  __kmp_get_gtid(), set_nthreads));
900  } else {
901  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
902  " %d threads\n",
903  __kmp_get_gtid(), new_nthreads, set_nthreads));
904  }
905 #endif // KMP_DEBUG
906  return new_nthreads;
907 }
908 
909 /* Allocate threads from the thread pool and assign them to the new team. We are
910  assured that there are enough threads available, because we checked on that
911  earlier within the forkjoin critical section. */
912 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
913  kmp_info_t *master_th, int master_gtid) {
914  int i;
915  int use_hot_team;
916 
917  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
918  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
919  KMP_MB();
920 
921  /* first, let's setup the primary thread */
922  master_th->th.th_info.ds.ds_tid = 0;
923  master_th->th.th_team = team;
924  master_th->th.th_team_nproc = team->t.t_nproc;
925  master_th->th.th_team_master = master_th;
926  master_th->th.th_team_serialized = FALSE;
927  master_th->th.th_dispatch = &team->t.t_dispatch[0];
928 
929 /* make sure we are not the optimized hot team */
930 #if KMP_NESTED_HOT_TEAMS
931  use_hot_team = 0;
932  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
933  if (hot_teams) { // hot teams array is not allocated if
934  // KMP_HOT_TEAMS_MAX_LEVEL=0
935  int level = team->t.t_active_level - 1; // index in array of hot teams
936  if (master_th->th.th_teams_microtask) { // are we inside the teams?
937  if (master_th->th.th_teams_size.nteams > 1) {
938  ++level; // level was not increased in teams construct for
939  // team_of_masters
940  }
941  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
942  master_th->th.th_teams_level == team->t.t_level) {
943  ++level; // level was not increased in teams construct for
944  // team_of_workers before the parallel
945  } // team->t.t_level will be increased inside parallel
946  }
947  if (level < __kmp_hot_teams_max_level) {
948  if (hot_teams[level].hot_team) {
949  // hot team has already been allocated for given level
950  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
951  use_hot_team = 1; // the team is ready to use
952  } else {
953  use_hot_team = 0; // AC: threads are not allocated yet
954  hot_teams[level].hot_team = team; // remember new hot team
955  hot_teams[level].hot_team_nth = team->t.t_nproc;
956  }
957  } else {
958  use_hot_team = 0;
959  }
960  }
961 #else
962  use_hot_team = team == root->r.r_hot_team;
963 #endif
964  if (!use_hot_team) {
965 
966  /* install the primary thread */
967  team->t.t_threads[0] = master_th;
968  __kmp_initialize_info(master_th, team, 0, master_gtid);
969 
970  /* now, install the worker threads */
971  for (i = 1; i < team->t.t_nproc; i++) {
972 
973  /* fork or reallocate a new thread and install it in team */
974  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
975  team->t.t_threads[i] = thr;
976  KMP_DEBUG_ASSERT(thr);
977  KMP_DEBUG_ASSERT(thr->th.th_team == team);
978  /* align team and thread arrived states */
979  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
980  "T#%d(%d:%d) join =%llu, plain=%llu\n",
981  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
982  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
983  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
984  team->t.t_bar[bs_plain_barrier].b_arrived));
985  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
986  thr->th.th_teams_level = master_th->th.th_teams_level;
987  thr->th.th_teams_size = master_th->th.th_teams_size;
988  { // Initialize threads' barrier data.
989  int b;
990  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
991  for (b = 0; b < bs_last_barrier; ++b) {
992  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
993  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
994 #if USE_DEBUGGER
995  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
996 #endif
997  }
998  }
999  }
1000 
1001 #if KMP_AFFINITY_SUPPORTED
1002  __kmp_partition_places(team);
1003 #endif
1004  }
1005 
1006  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1007  for (i = 0; i < team->t.t_nproc; i++) {
1008  kmp_info_t *thr = team->t.t_threads[i];
1009  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1010  thr->th.th_prev_level != team->t.t_level) {
1011  team->t.t_display_affinity = 1;
1012  break;
1013  }
1014  }
1015  }
1016 
1017  KMP_MB();
1018 }
1019 
1020 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1021 // Propagate any changes to the floating point control registers out to the team.
1022 // We try to avoid unnecessary writes to the relevant cache line in the team
1023 // structure, so we don't make changes unless they are needed.
1024 inline static void propagateFPControl(kmp_team_t *team) {
1025  if (__kmp_inherit_fp_control) {
1026  kmp_int16 x87_fpu_control_word;
1027  kmp_uint32 mxcsr;
1028 
1029  // Get primary thread's values of FPU control flags (both X87 and vector)
1030  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1031  __kmp_store_mxcsr(&mxcsr);
1032  mxcsr &= KMP_X86_MXCSR_MASK;
1033 
1034  // There is no point looking at t_fp_control_saved here.
1035  // If it is TRUE, we still have to update the values if they are different
1036  // from those we now have. If it is FALSE we didn't save anything yet, but
1037  // our objective is the same. We have to ensure that the values in the team
1038  // are the same as those we have.
1039  // So, this code achieves what we need whether or not t_fp_control_saved is
1040  // true. By checking whether the value needs updating we avoid unnecessary
1041  // writes that would put the cache-line into a written state, causing all
1042  // threads in the team to have to read it again.
1043  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1044  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1045  // Although we don't use this value, other code in the runtime wants to know
1046  // whether it should restore them. So we must ensure it is correct.
1047  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1048  } else {
1049  // Similarly here. Don't write to this cache-line in the team structure
1050  // unless we have to.
1051  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1052  }
1053 }
1054 
1055 // Do the opposite, setting the hardware registers to the updated values from
1056 // the team.
1057 inline static void updateHWFPControl(kmp_team_t *team) {
1058  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1059  // Only reset the fp control regs if they have been changed in the team
1060  // during the parallel region that we are exiting.
1061  kmp_int16 x87_fpu_control_word;
1062  kmp_uint32 mxcsr;
1063  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1064  __kmp_store_mxcsr(&mxcsr);
1065  mxcsr &= KMP_X86_MXCSR_MASK;
1066 
1067  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1068  __kmp_clear_x87_fpu_status_word();
1069  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1070  }
1071 
1072  if (team->t.t_mxcsr != mxcsr) {
1073  __kmp_load_mxcsr(&team->t.t_mxcsr);
1074  }
1075  }
1076 }
1077 #else
1078 #define propagateFPControl(x) ((void)0)
1079 #define updateHWFPControl(x) ((void)0)
1080 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
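// The pair above implements "copy the primary thread's FP environment into the
// team, then re-load it on threads whose registers drifted":
// propagateFPControl() captures the x87 control word and MXCSR and publishes
// them in kmp_team_t, writing only when the value changed so the shared cache
// line stays clean; updateHWFPControl() compares the team's saved values with
// the caller's current registers and reloads them on mismatch. A minimal
// sketch of the conditional-publish pattern (illustration only, same intent as
// KMP_CHECK_UPDATE, kept out of the build):
#if 0
template <typename T> static void publish_if_changed(T &shared, T value) {
  if (shared != value) // avoid dirtying the shared cache line when unchanged
    shared = value;
}
#endif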
1081 
1082 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1083  int realloc); // forward declaration
1084 
1085 /* Run a parallel region that has been serialized, so runs only in a team of the
1086  single primary thread. */
1087 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1088  kmp_info_t *this_thr;
1089  kmp_team_t *serial_team;
1090 
1091  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1092 
1093  /* Skip all this code for autopar serialized loops since it results in
1094  unacceptable overhead */
1095  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1096  return;
1097 
1098  if (!TCR_4(__kmp_init_parallel))
1099  __kmp_parallel_initialize();
1100  __kmp_resume_if_soft_paused();
1101 
1102  this_thr = __kmp_threads[global_tid];
1103  serial_team = this_thr->th.th_serial_team;
1104 
1105  /* utilize the serialized team held by this thread */
1106  KMP_DEBUG_ASSERT(serial_team);
1107  KMP_MB();
1108 
1109  if (__kmp_tasking_mode != tskm_immediate_exec) {
1110  KMP_DEBUG_ASSERT(
1111  this_thr->th.th_task_team ==
1112  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1113  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1114  NULL);
1115  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1116  "team %p, new task_team = NULL\n",
1117  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1118  this_thr->th.th_task_team = NULL;
1119  }
1120 
1121  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1122  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1123  proc_bind = proc_bind_false;
1124  } else if (proc_bind == proc_bind_default) {
1125  // No proc_bind clause was specified, so use the current value
1126  // of proc-bind-var for this parallel region.
1127  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1128  }
1129  // Reset for next parallel region
1130  this_thr->th.th_set_proc_bind = proc_bind_default;
1131 
1132 #if OMPT_SUPPORT
1133  ompt_data_t ompt_parallel_data = ompt_data_none;
1134  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1135  if (ompt_enabled.enabled &&
1136  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1137 
1138  ompt_task_info_t *parent_task_info;
1139  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1140 
1141  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1142  if (ompt_enabled.ompt_callback_parallel_begin) {
1143  int team_size = 1;
1144 
1145  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1146  &(parent_task_info->task_data), &(parent_task_info->frame),
1147  &ompt_parallel_data, team_size,
1148  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1149  }
1150  }
1151 #endif // OMPT_SUPPORT
1152 
1153  if (this_thr->th.th_team != serial_team) {
1154  // Nested level will be an index in the nested nthreads array
1155  int level = this_thr->th.th_team->t.t_level;
1156 
1157  if (serial_team->t.t_serialized) {
1158  /* this serial team was already used
1159  TODO: increase performance by making these locks more specific */
1160  kmp_team_t *new_team;
1161 
1162  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1163 
1164  new_team =
1165  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1166 #if OMPT_SUPPORT
1167  ompt_parallel_data,
1168 #endif
1169  proc_bind, &this_thr->th.th_current_task->td_icvs,
1170  0 USE_NESTED_HOT_ARG(NULL));
1171  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1172  KMP_ASSERT(new_team);
1173 
1174  /* setup new serialized team and install it */
1175  new_team->t.t_threads[0] = this_thr;
1176  new_team->t.t_parent = this_thr->th.th_team;
1177  serial_team = new_team;
1178  this_thr->th.th_serial_team = serial_team;
1179 
1180  KF_TRACE(
1181  10,
1182  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1183  global_tid, serial_team));
1184 
1185  /* TODO the above breaks the requirement that if we run out of resources,
1186  then we can still guarantee that serialized teams are ok, since we may
1187  need to allocate a new one */
1188  } else {
1189  KF_TRACE(
1190  10,
1191  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1192  global_tid, serial_team));
1193  }
1194 
1195  /* we have to initialize this serial team */
1196  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1197  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1198  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1199  serial_team->t.t_ident = loc;
1200  serial_team->t.t_serialized = 1;
1201  serial_team->t.t_nproc = 1;
1202  serial_team->t.t_parent = this_thr->th.th_team;
1203  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1204  this_thr->th.th_team = serial_team;
1205  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1206 
1207  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1208  this_thr->th.th_current_task));
1209  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1210  this_thr->th.th_current_task->td_flags.executing = 0;
1211 
1212  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1213 
1214  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1215  implicit task for each serialized task represented by
1216  team->t.t_serialized? */
1217  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1218  &this_thr->th.th_current_task->td_parent->td_icvs);
1219 
1220  // Thread value exists in the nested nthreads array for the next nested
1221  // level
1222  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1223  this_thr->th.th_current_task->td_icvs.nproc =
1224  __kmp_nested_nth.nth[level + 1];
1225  }
1226 
1227  if (__kmp_nested_proc_bind.used &&
1228  (level + 1 < __kmp_nested_proc_bind.used)) {
1229  this_thr->th.th_current_task->td_icvs.proc_bind =
1230  __kmp_nested_proc_bind.bind_types[level + 1];
1231  }
1232 
1233 #if USE_DEBUGGER
1234  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1235 #endif
1236  this_thr->th.th_info.ds.ds_tid = 0;
1237 
1238  /* set thread cache values */
1239  this_thr->th.th_team_nproc = 1;
1240  this_thr->th.th_team_master = this_thr;
1241  this_thr->th.th_team_serialized = 1;
1242 
1243  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1244  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1245  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1246 
1247  propagateFPControl(serial_team);
1248 
1249  /* check if we need to allocate dispatch buffers stack */
1250  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1251  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1252  serial_team->t.t_dispatch->th_disp_buffer =
1253  (dispatch_private_info_t *)__kmp_allocate(
1254  sizeof(dispatch_private_info_t));
1255  }
1256  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1257 
1258  KMP_MB();
1259 
1260  } else {
1261  /* this serialized team is already being used,
1262  * that's fine, just add another nested level */
1263  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1264  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266  ++serial_team->t.t_serialized;
1267  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1268 
1269  // Nested level will be an index in the nested nthreads array
1270  int level = this_thr->th.th_team->t.t_level;
1271  // Thread value exists in the nested nthreads array for the next nested
1272  // level
1273  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1274  this_thr->th.th_current_task->td_icvs.nproc =
1275  __kmp_nested_nth.nth[level + 1];
1276  }
1277  serial_team->t.t_level++;
1278  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1279  "of serial team %p to %d\n",
1280  global_tid, serial_team, serial_team->t.t_level));
1281 
1282  /* allocate/push dispatch buffers stack */
1283  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1284  {
1285  dispatch_private_info_t *disp_buffer =
1286  (dispatch_private_info_t *)__kmp_allocate(
1287  sizeof(dispatch_private_info_t));
1288  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1289  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1290  }
1291  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1292 
1293  KMP_MB();
1294  }
1295  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1296 
1297  // Perform the display affinity functionality for
1298  // serialized parallel regions
1299  if (__kmp_display_affinity) {
1300  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1301  this_thr->th.th_prev_num_threads != 1) {
1302  // NULL means use the affinity-format-var ICV
1303  __kmp_aux_display_affinity(global_tid, NULL);
1304  this_thr->th.th_prev_level = serial_team->t.t_level;
1305  this_thr->th.th_prev_num_threads = 1;
1306  }
1307  }
1308 
1309  if (__kmp_env_consistency_check)
1310  __kmp_push_parallel(global_tid, NULL);
1311 #if OMPT_SUPPORT
1312  serial_team->t.ompt_team_info.master_return_address = codeptr;
1313  if (ompt_enabled.enabled &&
1314  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1315  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1316  OMPT_GET_FRAME_ADDRESS(0);
1317 
1318  ompt_lw_taskteam_t lw_taskteam;
1319  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1320  &ompt_parallel_data, codeptr);
1321 
1322  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1323  // don't use lw_taskteam after linking. Content was swapped.
1324 
1325  /* OMPT implicit task begin */
1326  if (ompt_enabled.ompt_callback_implicit_task) {
1327  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1328  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1329  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1330  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1331  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1332  __kmp_tid_from_gtid(global_tid);
1333  }
1334 
1335  /* OMPT state */
1336  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1337  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1338  OMPT_GET_FRAME_ADDRESS(0);
1339  }
1340 #endif
1341 }
1342 
1343 /* most of the work for a fork */
1344 /* return true if we really went parallel, false if serialized */
1345 int __kmp_fork_call(ident_t *loc, int gtid,
1346  enum fork_context_e call_context, // Intel, GNU, ...
1347  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1348  kmp_va_list ap) {
1349  void **argv;
1350  int i;
1351  int master_tid;
1352  int master_this_cons;
1353  kmp_team_t *team;
1354  kmp_team_t *parent_team;
1355  kmp_info_t *master_th;
1356  kmp_root_t *root;
1357  int nthreads;
1358  int master_active;
1359  int master_set_numthreads;
1360  int level;
1361  int active_level;
1362  int teams_level;
1363 #if KMP_NESTED_HOT_TEAMS
1364  kmp_hot_team_ptr_t **p_hot_teams;
1365 #endif
1366  { // KMP_TIME_BLOCK
1367  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1368  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1369 
1370  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1371  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1372  /* Some systems prefer the stack for the root thread(s) to start with
1373  some gap from the parent stack to prevent false sharing. */
1374  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1375  /* These 2 lines below are so this does not get optimized out */
1376  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1377  __kmp_stkpadding += (short)((kmp_int64)dummy);
1378  }
1379 
1380  /* initialize if needed */
1381  KMP_DEBUG_ASSERT(
1382  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1383  if (!TCR_4(__kmp_init_parallel))
1384  __kmp_parallel_initialize();
1385  __kmp_resume_if_soft_paused();
1386 
1387  /* setup current data */
1388  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1389  // shutdown
1390  parent_team = master_th->th.th_team;
1391  master_tid = master_th->th.th_info.ds.ds_tid;
1392  master_this_cons = master_th->th.th_local.this_construct;
1393  root = master_th->th.th_root;
1394  master_active = root->r.r_active;
1395  master_set_numthreads = master_th->th.th_set_nproc;
1396 
1397 #if OMPT_SUPPORT
1398  ompt_data_t ompt_parallel_data = ompt_data_none;
1399  ompt_data_t *parent_task_data;
1400  ompt_frame_t *ompt_frame;
1401  ompt_data_t *implicit_task_data;
1402  void *return_address = NULL;
1403 
1404  if (ompt_enabled.enabled) {
1405  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1406  NULL, NULL);
1407  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1408  }
1409 #endif
1410 
1411  // Assign affinity to root thread if it hasn't happened yet
1412  __kmp_assign_root_init_mask();
1413 
1414  // Nested level will be an index in the nested nthreads array
1415  level = parent_team->t.t_level;
1416  // used to launch non-serial teams even if nested is not allowed
1417  active_level = parent_team->t.t_active_level;
1418  // needed to check nesting inside the teams
1419  teams_level = master_th->th.th_teams_level;
1420 #if KMP_NESTED_HOT_TEAMS
1421  p_hot_teams = &master_th->th.th_hot_teams;
1422  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1423  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1424  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1425  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1426  // it is either actual or not needed (when active_level > 0)
1427  (*p_hot_teams)[0].hot_team_nth = 1;
1428  }
1429 #endif
1430 
1431 #if OMPT_SUPPORT
1432  if (ompt_enabled.enabled) {
1433  if (ompt_enabled.ompt_callback_parallel_begin) {
1434  int team_size = master_set_numthreads
1435  ? master_set_numthreads
1436  : get__nproc_2(parent_team, master_tid);
1437  int flags = OMPT_INVOKER(call_context) |
1438  ((microtask == (microtask_t)__kmp_teams_master)
1439  ? ompt_parallel_league
1440  : ompt_parallel_team);
1441  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1442  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1443  return_address);
1444  }
1445  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1446  }
1447 #endif
1448 
1449  master_th->th.th_ident = loc;
1450 
1451  if (master_th->th.th_teams_microtask && ap &&
1452  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1453  // AC: This is the start of a parallel region nested inside a teams construct.
1454  // The team is actual (hot); all workers are ready at the fork barrier.
1455  // No lock is needed to initialize the team a bit, then free the workers.
1456  parent_team->t.t_ident = loc;
1457  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1458  parent_team->t.t_argc = argc;
1459  argv = (void **)parent_team->t.t_argv;
1460  for (i = argc - 1; i >= 0; --i)
1461  *argv++ = va_arg(kmp_va_deref(ap), void *);
1462  // Increment our nested depth levels, but do not increase the serialization
1463  if (parent_team == master_th->th.th_serial_team) {
1464  // AC: we are in serialized parallel
1465  __kmpc_serialized_parallel(loc, gtid);
1466  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1467 
1468  if (call_context == fork_context_gnu) {
1469  // AC: need to decrement t_serialized for enquiry functions to work
1470  // correctly, will restore at join time
1471  parent_team->t.t_serialized--;
1472  return TRUE;
1473  }
1474 
1475 #if OMPD_SUPPORT
1476  parent_team->t.t_pkfn = microtask;
1477 #endif
1478 
1479 #if OMPT_SUPPORT
1480  void *dummy;
1481  void **exit_frame_p;
1482 
1483  ompt_lw_taskteam_t lw_taskteam;
1484 
1485  if (ompt_enabled.enabled) {
1486  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1487  &ompt_parallel_data, return_address);
1488  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1489 
1490  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1491  // don't use lw_taskteam after linking. Content was swapped.
1492 
1493  /* OMPT implicit task begin */
1494  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1495  if (ompt_enabled.ompt_callback_implicit_task) {
1496  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1497  __kmp_tid_from_gtid(gtid);
1498  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1499  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1500  implicit_task_data, 1,
1501  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1502  }
1503 
1504  /* OMPT state */
1505  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1506  } else {
1507  exit_frame_p = &dummy;
1508  }
1509 #endif
1510  // AC: need to decrement t_serialized for enquiry functions to work
1511  // correctly, will restore at join time
1512  parent_team->t.t_serialized--;
1513 
1514  {
1515  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1516  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1517  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1518 #if OMPT_SUPPORT
1519  ,
1520  exit_frame_p
1521 #endif
1522  );
1523  }
1524 
1525 #if OMPT_SUPPORT
1526  if (ompt_enabled.enabled) {
1527  *exit_frame_p = NULL;
1528  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1529  if (ompt_enabled.ompt_callback_implicit_task) {
1530  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1531  ompt_scope_end, NULL, implicit_task_data, 1,
1532  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1533  }
1534  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1535  __ompt_lw_taskteam_unlink(master_th);
1536  if (ompt_enabled.ompt_callback_parallel_end) {
1537  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1538  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1539  OMPT_INVOKER(call_context) | ompt_parallel_team,
1540  return_address);
1541  }
1542  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1543  }
1544 #endif
1545  return TRUE;
1546  }
1547 
1548  parent_team->t.t_pkfn = microtask;
1549  parent_team->t.t_invoke = invoker;
1550  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1551  parent_team->t.t_active_level++;
1552  parent_team->t.t_level++;
1553  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1554 
1555 #if OMPT_SUPPORT
1556  if (ompt_enabled.enabled) {
1557  ompt_lw_taskteam_t lw_taskteam;
1558  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1559  &ompt_parallel_data, return_address);
1560  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1561  }
1562 #endif
1563 
1564  /* Change number of threads in the team if requested */
1565  if (master_set_numthreads) { // The parallel region has a num_threads clause
1566  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1567  // AC: we can only reduce the number of threads dynamically; we can't increase it
1568  kmp_info_t **other_threads = parent_team->t.t_threads;
1569  parent_team->t.t_nproc = master_set_numthreads;
1570  for (i = 0; i < master_set_numthreads; ++i) {
1571  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1572  }
1573  // Keep extra threads hot in the team for possible next parallels
1574  }
1575  master_th->th.th_set_nproc = 0;
1576  }
1577 
1578 #if USE_DEBUGGER
1579  if (__kmp_debugging) { // Let debugger override number of threads.
1580  int nth = __kmp_omp_num_threads(loc);
1581  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1582  master_set_numthreads = nth;
1583  }
1584  }
1585 #endif
1586 
1587 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1588  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1589  KMP_ITT_DEBUG) &&
1590  __kmp_forkjoin_frames_mode == 3 &&
1591  parent_team->t.t_active_level == 1 // only report frames at level 1
1592  && master_th->th.th_teams_size.nteams == 1) {
1593  kmp_uint64 tmp_time = __itt_get_timestamp();
1594  master_th->th.th_frame_time = tmp_time;
1595  parent_team->t.t_region_time = tmp_time;
1596  }
1597  if (__itt_stack_caller_create_ptr) {
1598  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1599  // create new stack stitching id before entering fork barrier
1600  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1601  }
1602 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1603 
1604  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1605  "master_th=%p, gtid=%d\n",
1606  root, parent_team, master_th, gtid));
1607  __kmp_internal_fork(loc, gtid, parent_team);
1608  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1609  "master_th=%p, gtid=%d\n",
1610  root, parent_team, master_th, gtid));
1611 
1612  if (call_context == fork_context_gnu)
1613  return TRUE;
1614 
1615  /* Invoke microtask for PRIMARY thread */
1616  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1617  parent_team->t.t_id, parent_team->t.t_pkfn));
1618 
1619  if (!parent_team->t.t_invoke(gtid)) {
1620  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1621  }
1622  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1623  parent_team->t.t_id, parent_team->t.t_pkfn));
1624  KMP_MB(); /* Flush all pending memory write invalidates. */
1625 
1626  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1627 
1628  return TRUE;
1629  } // Parallel closely nested in teams construct
1630 
1631 #if KMP_DEBUG
1632  if (__kmp_tasking_mode != tskm_immediate_exec) {
1633  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1634  parent_team->t.t_task_team[master_th->th.th_task_state]);
1635  }
1636 #endif
1637 
1638  int enter_teams = 0;
1639  if (parent_team->t.t_active_level >=
1640  master_th->th.th_current_task->td_icvs.max_active_levels) {
1641  nthreads = 1;
1642  } else {
1643  enter_teams = ((ap == NULL && active_level == 0) ||
1644  (ap && teams_level > 0 && teams_level == level));
1645  nthreads =
1646  master_set_numthreads
1647  ? master_set_numthreads
1648  : get__nproc_2(
1649  parent_team,
1650  master_tid); // TODO: get nproc directly from current task
1651 
1652  // Check whether we need to take the forkjoin lock (no need for a serialized
1653  // parallel outside of a teams construct). This code was moved here from
1654  // __kmp_reserve_threads() to speed up nested serialized parallels.
1655  if (nthreads > 1) {
1656  if ((get__max_active_levels(master_th) == 1 &&
1657  (root->r.r_in_parallel && !enter_teams)) ||
1658  (__kmp_library == library_serial)) {
1659  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1660  " threads\n",
1661  gtid, nthreads));
1662  nthreads = 1;
1663  }
1664  }
1665  if (nthreads > 1) {
1666  /* determine how many new threads we can use */
1667  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1668  /* AC: If we execute teams from a parallel region (on the host), then the
1669  teams should be created, but each can have only 1 thread if nesting is
1670  disabled. If teams is called from a serial region, then the teams and
1671  their threads should be created regardless of the nesting setting. */
1672  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1673  nthreads, enter_teams);
1674  if (nthreads == 1) {
1675  // Free lock for single thread execution here; for multi-thread
1676  // execution it will be freed later after team of threads created
1677  // and initialized
1678  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1679  }
1680  }
1681  }
1682  KMP_DEBUG_ASSERT(nthreads > 0);
1683 
1684  // If we temporarily changed the set number of threads then restore it now
1685  master_th->th.th_set_nproc = 0;
1686 
1687  /* create a serialized parallel region? */
1688  if (nthreads == 1) {
1689 /* josh todo: hypothetical question: what do we do for OS X*? */
1690 #if KMP_OS_LINUX && \
1691  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1692  void *args[argc];
1693 #else
1694  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1695 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1696  KMP_ARCH_AARCH64) */
1697 
1698  KA_TRACE(20,
1699  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1700 
1701  __kmpc_serialized_parallel(loc, gtid);
1702 
1703 #if OMPD_SUPPORT
1704  master_th->th.th_serial_team->t.t_pkfn = microtask;
1705 #endif
1706 
1707  if (call_context == fork_context_intel) {
1708  /* TODO this sucks, use the compiler itself to pass args! :) */
1709  master_th->th.th_serial_team->t.t_ident = loc;
1710  if (!ap) {
1711  // revert change made in __kmpc_serialized_parallel()
1712  master_th->th.th_serial_team->t.t_level--;
1713  // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716  void *dummy;
1717  void **exit_frame_p;
1718  ompt_task_info_t *task_info;
1719 
1720  ompt_lw_taskteam_t lw_taskteam;
1721 
1722  if (ompt_enabled.enabled) {
1723  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724  &ompt_parallel_data, return_address);
1725 
1726  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727  // don't use lw_taskteam after linking; its content was swapped
1728 
1729  task_info = OMPT_CUR_TASK_INFO(master_th);
1730  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1731  if (ompt_enabled.ompt_callback_implicit_task) {
1732  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1733  __kmp_tid_from_gtid(gtid);
1734  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1735  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1736  &(task_info->task_data), 1,
1737  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1738  ompt_task_implicit);
1739  }
1740 
1741  /* OMPT state */
1742  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1743  } else {
1744  exit_frame_p = &dummy;
1745  }
1746 #endif
1747 
1748  {
1749  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1750  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1751  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1752  parent_team->t.t_argv
1753 #if OMPT_SUPPORT
1754  ,
1755  exit_frame_p
1756 #endif
1757  );
1758  }
1759 
1760 #if OMPT_SUPPORT
1761  if (ompt_enabled.enabled) {
1762  *exit_frame_p = NULL;
1763  if (ompt_enabled.ompt_callback_implicit_task) {
1764  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1765  ompt_scope_end, NULL, &(task_info->task_data), 1,
1766  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1767  ompt_task_implicit);
1768  }
1769  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1770  __ompt_lw_taskteam_unlink(master_th);
1771  if (ompt_enabled.ompt_callback_parallel_end) {
1772  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1773  &ompt_parallel_data, parent_task_data,
1774  OMPT_INVOKER(call_context) | ompt_parallel_team,
1775  return_address);
1776  }
1777  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1778  }
1779 #endif
1780  } else if (microtask == (microtask_t)__kmp_teams_master) {
1781  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1782  master_th->th.th_serial_team);
1783  team = master_th->th.th_team;
1784  // team->t.t_pkfn = microtask;
1785  team->t.t_invoke = invoker;
1786  __kmp_alloc_argv_entries(argc, team, TRUE);
1787  team->t.t_argc = argc;
1788  argv = (void **)team->t.t_argv;
1789  if (ap) {
1790  for (i = argc - 1; i >= 0; --i)
1791  *argv++ = va_arg(kmp_va_deref(ap), void *);
1792  } else {
1793  for (i = 0; i < argc; ++i)
1794  // Get args from parent team for teams construct
1795  argv[i] = parent_team->t.t_argv[i];
1796  }
1797  // AC: revert change made in __kmpc_serialized_parallel()
1798  // because initial code in teams should have level=0
1799  team->t.t_level--;
1800  // AC: call special invoker for outer "parallel" of teams construct
1801  invoker(gtid);
1802 #if OMPT_SUPPORT
1803  if (ompt_enabled.enabled) {
1804  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1805  if (ompt_enabled.ompt_callback_implicit_task) {
1806  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1807  ompt_scope_end, NULL, &(task_info->task_data), 0,
1808  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1809  }
1810  if (ompt_enabled.ompt_callback_parallel_end) {
1811  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812  &ompt_parallel_data, parent_task_data,
1813  OMPT_INVOKER(call_context) | ompt_parallel_league,
1814  return_address);
1815  }
1816  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1817  }
1818 #endif
1819  } else {
1820  argv = args;
1821  for (i = argc - 1; i >= 0; --i)
1822  *argv++ = va_arg(kmp_va_deref(ap), void *);
1823  KMP_MB();
1824 
1825 #if OMPT_SUPPORT
1826  void *dummy;
1827  void **exit_frame_p;
1828  ompt_task_info_t *task_info;
1829 
1830  ompt_lw_taskteam_t lw_taskteam;
1831 
1832  if (ompt_enabled.enabled) {
1833  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1834  &ompt_parallel_data, return_address);
1835  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1836  // don't use lw_taskteam after linking; its content was swapped
1837  task_info = OMPT_CUR_TASK_INFO(master_th);
1838  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1839 
1840  /* OMPT implicit task begin */
1841  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1842  if (ompt_enabled.ompt_callback_implicit_task) {
1843  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1844  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1845  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1846  ompt_task_implicit);
1847  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1848  __kmp_tid_from_gtid(gtid);
1849  }
1850 
1851  /* OMPT state */
1852  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1853  } else {
1854  exit_frame_p = &dummy;
1855  }
1856 #endif
1857 
1858  {
1859  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1860  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1861  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1862 #if OMPT_SUPPORT
1863  ,
1864  exit_frame_p
1865 #endif
1866  );
1867  }
1868 
1869 #if OMPT_SUPPORT
1870  if (ompt_enabled.enabled) {
1871  *exit_frame_p = NULL;
1872  if (ompt_enabled.ompt_callback_implicit_task) {
1873  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1874  ompt_scope_end, NULL, &(task_info->task_data), 1,
1875  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1876  ompt_task_implicit);
1877  }
1878 
1879  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1880  __ompt_lw_taskteam_unlink(master_th);
1881  if (ompt_enabled.ompt_callback_parallel_end) {
1882  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1883  &ompt_parallel_data, parent_task_data,
1884  OMPT_INVOKER(call_context) | ompt_parallel_team,
1885  return_address);
1886  }
1887  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1888  }
1889 #endif
1890  }
1891  } else if (call_context == fork_context_gnu) {
1892 #if OMPT_SUPPORT
1893  ompt_lw_taskteam_t lwt;
1894  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1895  return_address);
1896 
1897  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1898  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1899 // don't use lw_taskteam after linking; its content was swapped
1900 #endif
1901 
1902  // we were called from GNU native code
1903  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1904  return FALSE;
1905  } else {
1906  KMP_ASSERT2(call_context < fork_context_last,
1907  "__kmp_fork_call: unknown fork_context parameter");
1908  }
1909 
1910  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1911  KMP_MB();
1912  return FALSE;
1913  } // if (nthreads == 1)
1914 
1915  // GEH: only modify the executing flag when not serialized;
1916  // the serialized case is handled in __kmpc_serialized_parallel
1917  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1918  "curtask=%p, curtask_max_aclevel=%d\n",
1919  parent_team->t.t_active_level, master_th,
1920  master_th->th.th_current_task,
1921  master_th->th.th_current_task->td_icvs.max_active_levels));
1922  // TODO: GEH - cannot do this assertion because root thread not set up as
1923  // executing
1924  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1925  master_th->th.th_current_task->td_flags.executing = 0;
1926 
1927  if (!master_th->th.th_teams_microtask || level > teams_level) {
1928  /* Increment our nested depth level */
1929  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1930  }
1931 
1932  // See if we need to make a copy of the ICVs.
1933  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934  if ((level + 1 < __kmp_nested_nth.used) &&
1935  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1936  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1937  } else {
1938  nthreads_icv = 0; // don't update
1939  }
1940 
1941  // Figure out the proc_bind_policy for the new team.
1942  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1943  kmp_proc_bind_t proc_bind_icv =
1944  proc_bind_default; // proc_bind_default means don't update
1945  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1946  proc_bind = proc_bind_false;
1947  } else {
1948  if (proc_bind == proc_bind_default) {
1949  // No proc_bind clause specified; use current proc-bind-var for this
1950  // parallel region
1951  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952  }
1953  /* else: The proc_bind policy was specified explicitly on parallel clause.
1954  This overrides proc-bind-var for this parallel region, but does not
1955  change proc-bind-var. */
1956  // Figure the value of proc-bind-var for the child threads.
1957  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1958  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1959  master_th->th.th_current_task->td_icvs.proc_bind)) {
1960  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1961  }
1962  }
1963 
1964  // Reset for next parallel region
1965  master_th->th.th_set_proc_bind = proc_bind_default;
1966 
1967  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1968  kmp_internal_control_t new_icvs;
1969  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1970  new_icvs.next = NULL;
1971  if (nthreads_icv > 0) {
1972  new_icvs.nproc = nthreads_icv;
1973  }
1974  if (proc_bind_icv != proc_bind_default) {
1975  new_icvs.proc_bind = proc_bind_icv;
1976  }
1977 
1978  /* allocate a new parallel team */
1979  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1980  team = __kmp_allocate_team(root, nthreads, nthreads,
1981 #if OMPT_SUPPORT
1982  ompt_parallel_data,
1983 #endif
1984  proc_bind, &new_icvs,
1985  argc USE_NESTED_HOT_ARG(master_th));
1986  } else {
1987  /* allocate a new parallel team */
1988  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1989  team = __kmp_allocate_team(root, nthreads, nthreads,
1990 #if OMPT_SUPPORT
1991  ompt_parallel_data,
1992 #endif
1993  proc_bind,
1994  &master_th->th.th_current_task->td_icvs,
1995  argc USE_NESTED_HOT_ARG(master_th));
1996  }
1997  KF_TRACE(
1998  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1999 
2000  /* setup the new team */
2001  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2002  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2003  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2004  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2005  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2006 #if OMPT_SUPPORT
2007  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2008  return_address);
2009 #endif
2010  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2011  // TODO: parent_team->t.t_level == INT_MAX ???
2012  if (!master_th->th.th_teams_microtask || level > teams_level) {
2013  int new_level = parent_team->t.t_level + 1;
2014  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2015  new_level = parent_team->t.t_active_level + 1;
2016  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2017  } else {
2018  // AC: Do not increase parallel level at start of the teams construct
2019  int new_level = parent_team->t.t_level;
2020  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2021  new_level = parent_team->t.t_active_level;
2022  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2023  }
2024  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2025  // set primary thread's schedule as new run-time schedule
2026  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2027 
2028  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2029  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2030 
2031  // Update the floating point rounding in the team if required.
2032  propagateFPControl(team);
2033 #if OMPD_SUPPORT
2034  if (ompd_state & OMPD_ENABLE_BP)
2035  ompd_bp_parallel_begin();
2036 #endif
2037 
2038  if (__kmp_tasking_mode != tskm_immediate_exec) {
2039  // Set primary thread's task team to team's task team. Unless this is hot
2040  // team, it should be NULL.
2041  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2042  parent_team->t.t_task_team[master_th->th.th_task_state]);
2043  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2044  "%p, new task_team %p / team %p\n",
2045  __kmp_gtid_from_thread(master_th),
2046  master_th->th.th_task_team, parent_team,
2047  team->t.t_task_team[master_th->th.th_task_state], team));
2048 
2049  if (active_level || master_th->th.th_task_team) {
2050  // Record the primary thread's task_state
2051  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2052  if (master_th->th.th_task_state_top >=
2053  master_th->th.th_task_state_stack_sz) { // increase size
2054  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2055  kmp_uint8 *old_stack, *new_stack;
2056  kmp_uint32 i;
2057  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2058  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2059  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2060  }
2061  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2062  ++i) { // zero-init rest of stack
2063  new_stack[i] = 0;
2064  }
2065  old_stack = master_th->th.th_task_state_memo_stack;
2066  master_th->th.th_task_state_memo_stack = new_stack;
2067  master_th->th.th_task_state_stack_sz = new_size;
2068  __kmp_free(old_stack);
2069  }
2070  // Store primary thread's task_state on stack
2071  master_th->th
2072  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2073  master_th->th.th_task_state;
2074  master_th->th.th_task_state_top++;
2075 #if KMP_NESTED_HOT_TEAMS
2076  if (master_th->th.th_hot_teams &&
2077  active_level < __kmp_hot_teams_max_level &&
2078  team == master_th->th.th_hot_teams[active_level].hot_team) {
2079  // Restore primary thread's nested state if nested hot team
2080  master_th->th.th_task_state =
2081  master_th->th
2082  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2083  } else {
2084 #endif
2085  master_th->th.th_task_state = 0;
2086 #if KMP_NESTED_HOT_TEAMS
2087  }
2088 #endif
2089  }
2090 #if !KMP_NESTED_HOT_TEAMS
2091  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2092  (team == root->r.r_hot_team));
2093 #endif
2094  }
2095 
2096  KA_TRACE(
2097  20,
2098  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2099  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2100  team->t.t_nproc));
2101  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2102  (team->t.t_master_tid == 0 &&
2103  (team->t.t_parent == root->r.r_root_team ||
2104  team->t.t_parent->t.t_serialized)));
2105  KMP_MB();
2106 
2107  /* now, setup the arguments */
2108  argv = (void **)team->t.t_argv;
2109  if (ap) {
2110  for (i = argc - 1; i >= 0; --i) {
2111  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2112  KMP_CHECK_UPDATE(*argv, new_argv);
2113  argv++;
2114  }
2115  } else {
2116  for (i = 0; i < argc; ++i) {
2117  // Get args from parent team for teams construct
2118  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2119  }
2120  }
2121 
2122  /* now actually fork the threads */
2123  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2124  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2125  root->r.r_active = TRUE;
2126 
2127  __kmp_fork_team_threads(root, team, master_th, gtid);
2128  __kmp_setup_icv_copy(team, nthreads,
2129  &master_th->th.th_current_task->td_icvs, loc);
2130 
2131 #if OMPT_SUPPORT
2132  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2133 #endif
2134 
2135  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2136 
2137 #if USE_ITT_BUILD
2138  if (team->t.t_active_level == 1 // only report frames at level 1
2139  && !master_th->th.th_teams_microtask) { // not in teams construct
2140 #if USE_ITT_NOTIFY
2141  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2142  (__kmp_forkjoin_frames_mode == 3 ||
2143  __kmp_forkjoin_frames_mode == 1)) {
2144  kmp_uint64 tmp_time = 0;
2145  if (__itt_get_timestamp_ptr)
2146  tmp_time = __itt_get_timestamp();
2147  // Internal fork - report frame begin
2148  master_th->th.th_frame_time = tmp_time;
2149  if (__kmp_forkjoin_frames_mode == 3)
2150  team->t.t_region_time = tmp_time;
2151  } else
2152 // only one notification scheme (either "submit" or "forking/joined", not both)
2153 #endif /* USE_ITT_NOTIFY */
2154  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2155  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2156  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2157  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2158  }
2159  }
2160 #endif /* USE_ITT_BUILD */
2161 
2162  /* now go on and do the work */
2163  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2164  KMP_MB();
2165  KF_TRACE(10,
2166  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2167  root, team, master_th, gtid));
2168 
2169 #if USE_ITT_BUILD
2170  if (__itt_stack_caller_create_ptr) {
2171  // create new stack stitching id before entering fork barrier
2172  if (!enter_teams) {
2173  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2174  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2175  } else if (parent_team->t.t_serialized) {
2176  // keep stack stitching id in the serialized parent_team;
2177  // current team will be used for parallel inside the teams;
2178  // if parent_team is active, then it already keeps stack stitching id
2179  // for the league of teams
2180  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2181  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2182  }
2183  }
2184 #endif /* USE_ITT_BUILD */
2185 
2186  // AC: skip __kmp_internal_fork at teams construct, let only primary
2187  // threads execute
2188  if (ap) {
2189  __kmp_internal_fork(loc, gtid, team);
2190  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2191  "master_th=%p, gtid=%d\n",
2192  root, team, master_th, gtid));
2193  }
2194 
2195  if (call_context == fork_context_gnu) {
2196  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2197  return TRUE;
2198  }
2199 
2200  /* Invoke microtask for PRIMARY thread */
2201  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2202  team->t.t_id, team->t.t_pkfn));
2203  } // END of timer KMP_fork_call block
2204 
2205 #if KMP_STATS_ENABLED
2206  // If beginning a teams construct, then change thread state
2207  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2208  if (!ap) {
2209  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2210  }
2211 #endif
2212 
2213  if (!team->t.t_invoke(gtid)) {
2214  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2215  }
2216 
2217 #if KMP_STATS_ENABLED
2218  // If was beginning of a teams construct, then reset thread state
2219  if (!ap) {
2220  KMP_SET_THREAD_STATE(previous_state);
2221  }
2222 #endif
2223 
2224  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2225  team->t.t_id, team->t.t_pkfn));
2226  KMP_MB(); /* Flush all pending memory write invalidates. */
2227 
2228  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229 #if OMPT_SUPPORT
2230  if (ompt_enabled.enabled) {
2231  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2232  }
2233 #endif
2234 
2235  return TRUE;
2236 }
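// Editor's sketch (illustrative, not part of the runtime): a compiler
// typically lowers "#pragma omp parallel" into an outlined microtask plus a
// call to the public entry point __kmpc_fork_call(), which in turn drives
// __kmp_fork_call() above. A minimal user-level program that exercises this
// path, assuming <omp.h> and a libomp link:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     #pragma omp parallel num_threads(4)
//     printf("hello from thread %d of %d\n", omp_get_thread_num(),
//            omp_get_num_threads());
//     return 0;
//   }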
2237 
2238 #if OMPT_SUPPORT
2239 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2240  kmp_team_t *team) {
2241  // restore state outside the region
2242  thread->th.ompt_thread_info.state =
2243  ((team->t.t_serialized) ? ompt_state_work_serial
2244  : ompt_state_work_parallel);
2245 }
2246 
2247 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2248  kmp_team_t *team, ompt_data_t *parallel_data,
2249  int flags, void *codeptr) {
2250  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2251  if (ompt_enabled.ompt_callback_parallel_end) {
2252  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2253  parallel_data, &(task_info->task_data), flags, codeptr);
2254  }
2255 
2256  task_info->frame.enter_frame = ompt_data_none;
2257  __kmp_join_restore_state(thread, team);
2258 }
2259 #endif
2260 
2261 void __kmp_join_call(ident_t *loc, int gtid
2262 #if OMPT_SUPPORT
2263  ,
2264  enum fork_context_e fork_context
2265 #endif
2266  ,
2267  int exit_teams) {
2268  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2269  kmp_team_t *team;
2270  kmp_team_t *parent_team;
2271  kmp_info_t *master_th;
2272  kmp_root_t *root;
2273  int master_active;
2274 
2275  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2276 
2277  /* setup current data */
2278  master_th = __kmp_threads[gtid];
2279  root = master_th->th.th_root;
2280  team = master_th->th.th_team;
2281  parent_team = team->t.t_parent;
2282 
2283  master_th->th.th_ident = loc;
2284 
2285 #if OMPT_SUPPORT
2286  void *team_microtask = (void *)team->t.t_pkfn;
2287  // For the GOMP interface with a serialized parallel, we need
2288  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2289  // end-implicit-task and end-parallel events.
2290  if (ompt_enabled.enabled &&
2291  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2292  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2293  }
2294 #endif
2295 
2296 #if KMP_DEBUG
2297  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2298  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2299  "th_task_team = %p\n",
2300  __kmp_gtid_from_thread(master_th), team,
2301  team->t.t_task_team[master_th->th.th_task_state],
2302  master_th->th.th_task_team));
2303  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2304  team->t.t_task_team[master_th->th.th_task_state]);
2305  }
2306 #endif
2307 
2308  if (team->t.t_serialized) {
2309  if (master_th->th.th_teams_microtask) {
2310  // We are in teams construct
2311  int level = team->t.t_level;
2312  int tlevel = master_th->th.th_teams_level;
2313  if (level == tlevel) {
2314  // AC: we haven't incremented it earlier at start of teams construct,
2315  // so do it here - at the end of teams construct
2316  team->t.t_level++;
2317  } else if (level == tlevel + 1) {
2318  // AC: we are exiting parallel inside teams, need to increment
2319  // serialization in order to restore it in the next call to
2320  // __kmpc_end_serialized_parallel
2321  team->t.t_serialized++;
2322  }
2323  }
2324  __kmpc_end_serialized_parallel(loc, gtid);
2325 
2326 #if OMPT_SUPPORT
2327  if (ompt_enabled.enabled) {
2328  __kmp_join_restore_state(master_th, parent_team);
2329  }
2330 #endif
2331 
2332  return;
2333  }
2334 
2335  master_active = team->t.t_master_active;
2336 
2337  if (!exit_teams) {
2338  // AC: No barrier for internal teams at exit from the teams construct,
2339  // but there is a barrier for the external team (league).
2340  __kmp_internal_join(loc, gtid, team);
2341 #if USE_ITT_BUILD
2342  if (__itt_stack_caller_create_ptr) {
2343  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2344  // destroy the stack stitching id after join barrier
2345  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2346  team->t.t_stack_id = NULL;
2347  }
2348 #endif
2349  } else {
2350  master_th->th.th_task_state =
2351  0; // AC: no tasking in teams (out of any parallel)
2352 #if USE_ITT_BUILD
2353  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2354  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2355  // destroy the stack stitching id on exit from the teams construct
2356  // if parent_team is active, then the id will be destroyed later on
2357  // by master of the league of teams
2358  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2359  parent_team->t.t_stack_id = NULL;
2360  }
2361 #endif
2362  }
2363 
2364  KMP_MB();
2365 
2366 #if OMPT_SUPPORT
2367  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2368  void *codeptr = team->t.ompt_team_info.master_return_address;
2369 #endif
2370 
2371 #if USE_ITT_BUILD
2372  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2373  if (team->t.t_active_level == 1 &&
2374  (!master_th->th.th_teams_microtask || /* not in teams construct */
2375  master_th->th.th_teams_size.nteams == 1)) {
2376  master_th->th.th_ident = loc;
2377  // only one notification scheme (either "submit" or "forking/joined", not
2378  // both)
2379  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2380  __kmp_forkjoin_frames_mode == 3)
2381  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2382  master_th->th.th_frame_time, 0, loc,
2383  master_th->th.th_team_nproc, 1);
2384  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2385  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2386  __kmp_itt_region_joined(gtid);
2387  } // active_level == 1
2388 #endif /* USE_ITT_BUILD */
2389 
2390  if (master_th->th.th_teams_microtask && !exit_teams &&
2391  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2392  team->t.t_level == master_th->th.th_teams_level + 1) {
2393 // AC: We need to leave the team structure intact at the end of parallel
2394 // inside the teams construct, so that at the next parallel same (hot) team
2395 // works, only adjust nesting levels
2396 #if OMPT_SUPPORT
2397  ompt_data_t ompt_parallel_data = ompt_data_none;
2398  if (ompt_enabled.enabled) {
2399  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2400  if (ompt_enabled.ompt_callback_implicit_task) {
2401  int ompt_team_size = team->t.t_nproc;
2402  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2403  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2404  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2405  }
2406  task_info->frame.exit_frame = ompt_data_none;
2407  task_info->task_data = ompt_data_none;
2408  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2409  __ompt_lw_taskteam_unlink(master_th);
2410  }
2411 #endif
2412  /* Decrement our nested depth level */
2413  team->t.t_level--;
2414  team->t.t_active_level--;
2415  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2416 
2417  // Restore number of threads in the team if needed. This code relies on
2418  // the proper adjustment of th_teams_size.nth after the fork in
2419  // __kmp_teams_master on each teams primary thread in the case that
2420  // __kmp_reserve_threads reduced it.
2421  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2422  int old_num = master_th->th.th_team_nproc;
2423  int new_num = master_th->th.th_teams_size.nth;
2424  kmp_info_t **other_threads = team->t.t_threads;
2425  team->t.t_nproc = new_num;
2426  for (int i = 0; i < old_num; ++i) {
2427  other_threads[i]->th.th_team_nproc = new_num;
2428  }
2429  // Adjust states of non-used threads of the team
2430  for (int i = old_num; i < new_num; ++i) {
2431  // Re-initialize thread's barrier data.
2432  KMP_DEBUG_ASSERT(other_threads[i]);
2433  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2434  for (int b = 0; b < bs_last_barrier; ++b) {
2435  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2436  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2437 #if USE_DEBUGGER
2438  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2439 #endif
2440  }
2441  if (__kmp_tasking_mode != tskm_immediate_exec) {
2442  // Synchronize thread's task state
2443  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2444  }
2445  }
2446  }
2447 
2448 #if OMPT_SUPPORT
2449  if (ompt_enabled.enabled) {
2450  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2451  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2452  }
2453 #endif
2454 
2455  return;
2456  }
2457 
2458  /* do cleanup and restore the parent team */
2459  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2460  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2461 
2462  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2463 
2464  /* jc: The following lock has instructions with REL and ACQ semantics,
2465  separating the parallel user code called in this parallel region
2466  from the serial user code called after this function returns. */
2467  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2468 
2469  if (!master_th->th.th_teams_microtask ||
2470  team->t.t_level > master_th->th.th_teams_level) {
2471  /* Decrement our nested depth level */
2472  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2473  }
2474  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2475 
2476 #if OMPT_SUPPORT
2477  if (ompt_enabled.enabled) {
2478  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2479  if (ompt_enabled.ompt_callback_implicit_task) {
2480  int flags = (team_microtask == (void *)__kmp_teams_master)
2481  ? ompt_task_initial
2482  : ompt_task_implicit;
2483  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2484  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2485  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2486  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2487  }
2488  task_info->frame.exit_frame = ompt_data_none;
2489  task_info->task_data = ompt_data_none;
2490  }
2491 #endif
2492 
2493  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2494  master_th, team));
2495  __kmp_pop_current_task_from_thread(master_th);
2496 
2497 #if KMP_AFFINITY_SUPPORTED
2498  // Restore master thread's partition.
2499  master_th->th.th_first_place = team->t.t_first_place;
2500  master_th->th.th_last_place = team->t.t_last_place;
2501 #endif // KMP_AFFINITY_SUPPORTED
2502  master_th->th.th_def_allocator = team->t.t_def_allocator;
2503 
2504 #if OMPD_SUPPORT
2505  if (ompd_state & OMPD_ENABLE_BP)
2506  ompd_bp_parallel_end();
2507 #endif
2508  updateHWFPControl(team);
2509 
2510  if (root->r.r_active != master_active)
2511  root->r.r_active = master_active;
2512 
2513  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2514  master_th)); // this will free worker threads
2515 
2516  /* This race was fun to find. Make sure the following is in the critical
2517  region; otherwise assertions may fail occasionally because the old team may
2518  be reallocated and the hierarchy appears inconsistent. It is actually safe
2519  to run and won't cause any bugs, only those assertion failures. It's only
2520  one deref & assign, so it might as well stay in the critical region. */
2521  master_th->th.th_team = parent_team;
2522  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2523  master_th->th.th_team_master = parent_team->t.t_threads[0];
2524  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2525 
2526  /* restore serialized team, if need be */
2527  if (parent_team->t.t_serialized &&
2528  parent_team != master_th->th.th_serial_team &&
2529  parent_team != root->r.r_root_team) {
2530  __kmp_free_team(root,
2531  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2532  master_th->th.th_serial_team = parent_team;
2533  }
2534 
2535  if (__kmp_tasking_mode != tskm_immediate_exec) {
2536  if (master_th->th.th_task_state_top >
2537  0) { // Restore task state from memo stack
2538  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2539  // Remember primary thread's state if we re-use this nested hot team
2540  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2541  master_th->th.th_task_state;
2542  --master_th->th.th_task_state_top; // pop
2543  // Now restore state at this level
2544  master_th->th.th_task_state =
2545  master_th->th
2546  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2547  }
2548  // Copy the task team from the parent team to the primary thread
2549  master_th->th.th_task_team =
2550  parent_team->t.t_task_team[master_th->th.th_task_state];
2551  KA_TRACE(20,
2552  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2553  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2554  parent_team));
2555  }
2556 
2557  // TODO: GEH - cannot do this assertion because root thread not set up as
2558  // executing
2559  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2560  master_th->th.th_current_task->td_flags.executing = 1;
2561 
2562  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2563 
2564 #if OMPT_SUPPORT
2565  int flags =
2566  OMPT_INVOKER(fork_context) |
2567  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2568  : ompt_parallel_team);
2569  if (ompt_enabled.enabled) {
2570  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2571  codeptr);
2572  }
2573 #endif
2574 
2575  KMP_MB();
2576  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2577 }
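// Editor's sketch (illustrative, not part of the runtime): __kmp_join_call()
// is the counterpart of __kmp_fork_call(); it restores the parent team so
// that nested regions unwind correctly. A hedged user-level example, assuming
// <omp.h>:
//
//   omp_set_max_active_levels(2);
//   #pragma omp parallel num_threads(2)   // outer fork
//   {
//     #pragma omp parallel num_threads(2) // inner fork ... inner join
//     { /* work */ }
//   }                                     // outer join restores the root team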
2578 
2579 /* Check whether we should push an internal control record onto the
2580  serial team stack. If so, do it. */
2581 void __kmp_save_internal_controls(kmp_info_t *thread) {
2582 
2583  if (thread->th.th_team != thread->th.th_serial_team) {
2584  return;
2585  }
2586  if (thread->th.th_team->t.t_serialized > 1) {
2587  int push = 0;
2588 
2589  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2590  push = 1;
2591  } else {
2592  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2593  thread->th.th_team->t.t_serialized) {
2594  push = 1;
2595  }
2596  }
2597  if (push) { /* push a record on the serial team's stack */
2598  kmp_internal_control_t *control =
2599  (kmp_internal_control_t *)__kmp_allocate(
2600  sizeof(kmp_internal_control_t));
2601 
2602  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2603 
2604  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2605 
2606  control->next = thread->th.th_team->t.t_control_stack_top;
2607  thread->th.th_team->t.t_control_stack_top = control;
2608  }
2609  }
2610 }
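// Editor's sketch (illustrative, not part of the runtime): the control stack
// maintained above lets ICV changes made inside a nested, serialized parallel
// region be restored when that region ends (the matching pop is expected to
// happen in __kmpc_end_serialized_parallel). A hedged user-level example,
// assuming <omp.h>:
//
//   #pragma omp parallel if(0)     // serialized
//   #pragma omp parallel if(0)     // nested, still serialized
//   {
//     omp_set_num_threads(8);      // a control record may be pushed first
//   }                              // previous ICVs restored on region exit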
2611 
2612 /* Changes set_nproc */
2613 void __kmp_set_num_threads(int new_nth, int gtid) {
2614  kmp_info_t *thread;
2615  kmp_root_t *root;
2616 
2617  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2618  KMP_DEBUG_ASSERT(__kmp_init_serial);
2619 
2620  if (new_nth < 1)
2621  new_nth = 1;
2622  else if (new_nth > __kmp_max_nth)
2623  new_nth = __kmp_max_nth;
2624 
2625  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2626  thread = __kmp_threads[gtid];
2627  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2628  return; // nothing to do
2629 
2630  __kmp_save_internal_controls(thread);
2631 
2632  set__nproc(thread, new_nth);
2633 
2634  // If this omp_set_num_threads() call will cause the hot team size to be
2635  // reduced (in the absence of a num_threads clause), then reduce it now,
2636  // rather than waiting for the next parallel region.
2637  root = thread->th.th_root;
2638  if (__kmp_init_parallel && (!root->r.r_active) &&
2639  (root->r.r_hot_team->t.t_nproc > new_nth)
2640 #if KMP_NESTED_HOT_TEAMS
2641  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2642 #endif
2643  ) {
2644  kmp_team_t *hot_team = root->r.r_hot_team;
2645  int f;
2646 
2647  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2648 
2649  // Release the extra threads we don't need any more.
2650  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2651  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2652  if (__kmp_tasking_mode != tskm_immediate_exec) {
2653  // When decreasing team size, threads no longer in the team should unref
2654  // task team.
2655  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2656  }
2657  __kmp_free_thread(hot_team->t.t_threads[f]);
2658  hot_team->t.t_threads[f] = NULL;
2659  }
2660  hot_team->t.t_nproc = new_nth;
2661 #if KMP_NESTED_HOT_TEAMS
2662  if (thread->th.th_hot_teams) {
2663  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2664  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2665  }
2666 #endif
2667 
2668  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2669 
2670  // Update the t_nproc field in the threads that are still active.
2671  for (f = 0; f < new_nth; f++) {
2672  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2673  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2674  }
2675  // Special flag for the omp_set_num_threads() call case
2676  hot_team->t.t_size_changed = -1;
2677  }
2678 }
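// Editor's sketch (illustrative, not part of the runtime): this routine
// typically backs omp_set_num_threads(); note that lowering the value can
// shrink the hot team immediately, as implemented above. Assuming <omp.h>;
// the function name below is hypothetical:
//
//   #include <omp.h>
//   void resize_example(void) {
//     omp_set_num_threads(8);
//     #pragma omp parallel         // team of up to 8 threads
//     { /* work */ }
//     omp_set_num_threads(2);      // hot team may be trimmed right away
//     #pragma omp parallel         // team of up to 2 threads
//     { /* work */ }
//   }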
2679 
2680 /* Changes max_active_levels */
2681 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2682  kmp_info_t *thread;
2683 
2684  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2685  "%d = (%d)\n",
2686  gtid, max_active_levels));
2687  KMP_DEBUG_ASSERT(__kmp_init_serial);
2688 
2689  // validate max_active_levels
2690  if (max_active_levels < 0) {
2691  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2692  // We ignore this call if the user has specified a negative value.
2693  // The current setting won't be changed. The last valid setting will be
2694  // used. A warning will be issued (if warnings are allowed as controlled by
2695  // the KMP_WARNINGS env var).
2696  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2697  "max_active_levels for thread %d = (%d)\n",
2698  gtid, max_active_levels));
2699  return;
2700  }
2701  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2702  // it's OK, the max_active_levels is within the valid range: [ 0;
2703  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2704  // We allow a zero value. (implementation defined behavior)
2705  } else {
2706  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2707  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2708  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2709  // Current upper limit is MAX_INT. (implementation defined behavior)
2710  // If the input exceeds the upper limit, we correct the input to be the
2711  // upper limit. (implementation defined behavior)
2712  // In practice, control should never reach here while the limit is MAX_INT.
2713  }
2714  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2715  "max_active_levels for thread %d = (%d)\n",
2716  gtid, max_active_levels));
2717 
2718  thread = __kmp_threads[gtid];
2719 
2720  __kmp_save_internal_controls(thread);
2721 
2722  set__max_active_levels(thread, max_active_levels);
2723 }
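// Editor's sketch (illustrative, not part of the runtime): max_active_levels
// caps how many nested parallel regions can be active at once;
// omp_set_max_active_levels() typically reaches this routine. Assuming
// <omp.h>:
//
//   omp_set_max_active_levels(2);
//   #pragma omp parallel num_threads(2)   // active level 1
//   #pragma omp parallel num_threads(2)   // active level 2
//   #pragma omp parallel num_threads(2)   // over the cap: runs serialized
//   { /* work */ }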
2724 
2725 /* Gets max_active_levels */
2726 int __kmp_get_max_active_levels(int gtid) {
2727  kmp_info_t *thread;
2728 
2729  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2730  KMP_DEBUG_ASSERT(__kmp_init_serial);
2731 
2732  thread = __kmp_threads[gtid];
2733  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2734  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2735  "curtask_maxaclevel=%d\n",
2736  gtid, thread->th.th_current_task,
2737  thread->th.th_current_task->td_icvs.max_active_levels));
2738  return thread->th.th_current_task->td_icvs.max_active_levels;
2739 }
2740 
2741 // nteams-var per-device ICV
2742 void __kmp_set_num_teams(int num_teams) {
2743  if (num_teams > 0)
2744  __kmp_nteams = num_teams;
2745 }
2746 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2747 // teams-thread-limit-var per-device ICV
2748 void __kmp_set_teams_thread_limit(int limit) {
2749  if (limit > 0)
2750  __kmp_teams_thread_limit = limit;
2751 }
2752 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
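// Editor's sketch (illustrative, not part of the runtime): these per-device
// ICVs back the OpenMP 5.1 routines omp_set_num_teams() and
// omp_set_teams_thread_limit(), which provide defaults for a later teams
// construct. Assuming <omp.h>:
//
//   omp_set_num_teams(4);
//   omp_set_teams_thread_limit(8);
//   #pragma omp teams             // up to 4 teams, each limited to 8 threads
//   { /* work */ }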
2753 
2754 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2755 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2756 
2757 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2758 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2759  kmp_info_t *thread;
2760  kmp_sched_t orig_kind;
2761  // kmp_team_t *team;
2762 
2763  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2764  gtid, (int)kind, chunk));
2765  KMP_DEBUG_ASSERT(__kmp_init_serial);
2766 
2767  // Check if the kind parameter is valid, correct if needed.
2768  // Valid parameters should fit in one of two intervals - standard or extended:
2769  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2770  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2771  orig_kind = kind;
2772  kind = __kmp_sched_without_mods(kind);
2773 
2774  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2775  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2776  // TODO: Hint needs attention in case we change the default schedule.
2777  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2778  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2779  __kmp_msg_null);
2780  kind = kmp_sched_default;
2781  chunk = 0; // ignore chunk value in case of bad kind
2782  }
2783 
2784  thread = __kmp_threads[gtid];
2785 
2786  __kmp_save_internal_controls(thread);
2787 
2788  if (kind < kmp_sched_upper_std) {
2789  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2790  // distinguish static chunked vs. unchunked: chunk should be invalid to
2791  // indicate an unchunked schedule (which is the default)
2792  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2793  } else {
2794  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2795  __kmp_sch_map[kind - kmp_sched_lower - 1];
2796  }
2797  } else {
2798  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799  // kmp_sched_lower - 2 ];
2800  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2801  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2802  kmp_sched_lower - 2];
2803  }
2804  __kmp_sched_apply_mods_intkind(
2805  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2806  if (kind == kmp_sched_auto || chunk < 1) {
2807  // ignore parameter chunk for schedule auto
2808  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2809  } else {
2810  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2811  }
2812 }
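// Editor's sketch (illustrative, not part of the runtime): the validated
// (kind, chunk) pair stored above is what a schedule(runtime) loop later
// consumes; omp_set_schedule() is the usual way to reach this routine.
// Assuming <omp.h>; n and work() are placeholders:
//
//   omp_set_schedule(omp_sched_dynamic, 4);
//   #pragma omp parallel for schedule(runtime)
//   for (int i = 0; i < n; ++i)
//     work(i);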
2813 
2814 /* Gets def_sched_var ICV values */
2815 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2816  kmp_info_t *thread;
2817  enum sched_type th_type;
2818 
2819  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2820  KMP_DEBUG_ASSERT(__kmp_init_serial);
2821 
2822  thread = __kmp_threads[gtid];
2823 
2824  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2825  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2826  case kmp_sch_static:
2827  case kmp_sch_static_greedy:
2828  case kmp_sch_static_balanced:
2829  *kind = kmp_sched_static;
2830  __kmp_sched_apply_mods_stdkind(kind, th_type);
2831  *chunk = 0; // chunk was not set; signal this with a zero value
2832  return;
2833  case kmp_sch_static_chunked:
2834  *kind = kmp_sched_static;
2835  break;
2836  case kmp_sch_dynamic_chunked:
2837  *kind = kmp_sched_dynamic;
2838  break;
2839  case kmp_sch_guided_chunked:
2840  case kmp_sch_guided_iterative_chunked:
2841  case kmp_sch_guided_analytical_chunked:
2842  *kind = kmp_sched_guided;
2843  break;
2844  case kmp_sch_auto:
2845  *kind = kmp_sched_auto;
2846  break;
2847  case kmp_sch_trapezoidal:
2848  *kind = kmp_sched_trapezoidal;
2849  break;
2850 #if KMP_STATIC_STEAL_ENABLED
2851  case kmp_sch_static_steal:
2852  *kind = kmp_sched_static_steal;
2853  break;
2854 #endif
2855  default:
2856  KMP_FATAL(UnknownSchedulingType, th_type);
2857  }
2858 
2859  __kmp_sched_apply_mods_stdkind(kind, th_type);
2860  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2861 }
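// Editor's sketch (illustrative, not part of the runtime): the reverse mapping
// above is what omp_get_schedule() reports back to the user. Assuming <omp.h>:
//
//   omp_sched_t kind;
//   int chunk;
//   omp_get_schedule(&kind, &chunk); // e.g. kind == omp_sched_dynamic, chunk == 4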
2862 
2863 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2864 
2865  int ii, dd;
2866  kmp_team_t *team;
2867  kmp_info_t *thr;
2868 
2869  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2870  KMP_DEBUG_ASSERT(__kmp_init_serial);
2871 
2872  // validate level
2873  if (level == 0)
2874  return 0;
2875  if (level < 0)
2876  return -1;
2877  thr = __kmp_threads[gtid];
2878  team = thr->th.th_team;
2879  ii = team->t.t_level;
2880  if (level > ii)
2881  return -1;
2882 
2883  if (thr->th.th_teams_microtask) {
2884  // AC: we are in teams region where multiple nested teams have same level
2885  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2886  if (level <=
2887  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2888  KMP_DEBUG_ASSERT(ii >= tlevel);
2889  // AC: As we need to pass by the teams league, we need to artificially
2890  // increase ii
2891  if (ii == tlevel) {
2892  ii += 2; // three teams have same level
2893  } else {
2894  ii++; // two teams have same level
2895  }
2896  }
2897  }
2898 
2899  if (ii == level)
2900  return __kmp_tid_from_gtid(gtid);
2901 
2902  dd = team->t.t_serialized;
2903  level++;
2904  while (ii > level) {
2905  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2906  }
2907  if ((team->t.t_serialized) && (!dd)) {
2908  team = team->t.t_parent;
2909  continue;
2910  }
2911  if (ii > level) {
2912  team = team->t.t_parent;
2913  dd = team->t.t_serialized;
2914  ii--;
2915  }
2916  }
2917 
2918  return (dd > 1) ? (0) : (team->t.t_master_tid);
2919 }
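// Editor's sketch (illustrative, not part of the runtime): this walk up the
// team hierarchy serves omp_get_ancestor_thread_num(level). Assuming <omp.h>:
//
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(2)
//   {
//     int outer = omp_get_ancestor_thread_num(1); // tid in the level-1 team
//     int self  = omp_get_ancestor_thread_num(2); // == omp_get_thread_num()
//   }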
2920 
2921 int __kmp_get_team_size(int gtid, int level) {
2922 
2923  int ii, dd;
2924  kmp_team_t *team;
2925  kmp_info_t *thr;
2926 
2927  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2928  KMP_DEBUG_ASSERT(__kmp_init_serial);
2929 
2930  // validate level
2931  if (level == 0)
2932  return 1;
2933  if (level < 0)
2934  return -1;
2935  thr = __kmp_threads[gtid];
2936  team = thr->th.th_team;
2937  ii = team->t.t_level;
2938  if (level > ii)
2939  return -1;
2940 
2941  if (thr->th.th_teams_microtask) {
2942  // AC: we are in teams region where multiple nested teams have same level
2943  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2944  if (level <=
2945  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2946  KMP_DEBUG_ASSERT(ii >= tlevel);
2947  // AC: As we need to pass by the teams league, we need to artificially
2948  // increase ii
2949  if (ii == tlevel) {
2950  ii += 2; // three teams have same level
2951  } else {
2952  ii++; // two teams have same level
2953  }
2954  }
2955  }
2956 
2957  while (ii > level) {
2958  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2959  }
2960  if (team->t.t_serialized && (!dd)) {
2961  team = team->t.t_parent;
2962  continue;
2963  }
2964  if (ii > level) {
2965  team = team->t.t_parent;
2966  ii--;
2967  }
2968  }
2969 
2970  return team->t.t_nproc;
2971 }
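// Editor's sketch (illustrative, not part of the runtime): the same traversal
// backs omp_get_team_size(level). Assuming <omp.h>:
//
//   #pragma omp parallel num_threads(4)
//   {
//     int sz0 = omp_get_team_size(0); // 1: the initial (implicit) team
//     int sz1 = omp_get_team_size(1); // team size of this parallel region
//   }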
2972 
2973 kmp_r_sched_t __kmp_get_schedule_global() {
2974  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2975  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2976  // independently, so the updated schedule can be obtained here.
2977 
2978  kmp_r_sched_t r_sched;
2979 
2980  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2981  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2982  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2983  // different roots (even in OMP 2.5)
2984  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2985  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2986  if (s == kmp_sch_static) {
2987  // replace STATIC with more detailed schedule (balanced or greedy)
2988  r_sched.r_sched_type = __kmp_static;
2989  } else if (s == kmp_sch_guided_chunked) {
2990  // replace GUIDED with more detailed schedule (iterative or analytical)
2991  r_sched.r_sched_type = __kmp_guided;
2992  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2993  r_sched.r_sched_type = __kmp_sched;
2994  }
2995  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2996 
2997  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2998  // __kmp_chunk may be wrong here (if it was not ever set)
2999  r_sched.chunk = KMP_DEFAULT_CHUNK;
3000  } else {
3001  r_sched.chunk = __kmp_chunk;
3002  }
3003 
3004  return r_sched;
3005 }
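// Editor's note (illustrative, not part of the runtime): the globals combined
// here are normally populated from the environment before the first parallel
// region, e.g. a standard setting such as
//
//   OMP_SCHEDULE="dynamic,2"
//
// after which a schedule(runtime) loop resolves to the r_sched value built by
// this routine.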
3006 
3007 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3008  at least argc t_argv entries for the requested team. */
3009 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3010 
3011  KMP_DEBUG_ASSERT(team);
3012  if (!realloc || argc > team->t.t_max_argc) {
3013 
3014  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3015  "current entries=%d\n",
3016  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3017  /* if previously allocated heap space for args, free them */
3018  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3019  __kmp_free((void *)team->t.t_argv);
3020 
3021  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3022  /* use unused space in the cache line for arguments */
3023  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3024  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3025  "argv entries\n",
3026  team->t.t_id, team->t.t_max_argc));
3027  team->t.t_argv = &team->t.t_inline_argv[0];
3028  if (__kmp_storage_map) {
3029  __kmp_print_storage_map_gtid(
3030  -1, &team->t.t_inline_argv[0],
3031  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3032  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3033  team->t.t_id);
3034  }
3035  } else {
3036  /* allocate space for arguments in the heap */
3037  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3038  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3039  : 2 * argc;
3040  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3041  "argv entries\n",
3042  team->t.t_id, team->t.t_max_argc));
3043  team->t.t_argv =
3044  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3045  if (__kmp_storage_map) {
3046  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3047  &team->t.t_argv[team->t.t_max_argc],
3048  sizeof(void *) * team->t.t_max_argc,
3049  "team_%d.t_argv", team->t.t_id);
3050  }
3051  }
3052  }
3053 }
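// Editor's note (illustrative, not part of the runtime): the sizing policy
// implemented above behaves roughly as follows:
//
//   argc <= KMP_INLINE_ARGV_ENTRIES         -> reuse t_inline_argv (no heap)
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2 -> allocate KMP_MIN_MALLOC_ARGV_ENTRIES
//   larger argc                             -> allocate 2 * argc entries
//
// so the capacity at least doubles whenever a heap reallocation is needed.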
3054 
3055 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3056  int i;
3057  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3058  team->t.t_threads =
3059  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3060  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3061  sizeof(dispatch_shared_info_t) * num_disp_buff);
3062  team->t.t_dispatch =
3063  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3064  team->t.t_implicit_task_taskdata =
3065  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3066  team->t.t_max_nproc = max_nth;
3067 
3068  /* setup dispatch buffers */
3069  for (i = 0; i < num_disp_buff; ++i) {
3070  team->t.t_disp_buffer[i].buffer_index = i;
3071  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3072  }
3073 }
3074 
3075 static void __kmp_free_team_arrays(kmp_team_t *team) {
3076  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3077  int i;
3078  for (i = 0; i < team->t.t_max_nproc; ++i) {
3079  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3080  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3081  team->t.t_dispatch[i].th_disp_buffer = NULL;
3082  }
3083  }
3084 #if KMP_USE_HIER_SCHED
3085  __kmp_dispatch_free_hierarchies(team);
3086 #endif
3087  __kmp_free(team->t.t_threads);
3088  __kmp_free(team->t.t_disp_buffer);
3089  __kmp_free(team->t.t_dispatch);
3090  __kmp_free(team->t.t_implicit_task_taskdata);
3091  team->t.t_threads = NULL;
3092  team->t.t_disp_buffer = NULL;
3093  team->t.t_dispatch = NULL;
3094  team->t.t_implicit_task_taskdata = 0;
3095 }
3096 
3097 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3098  kmp_info_t **oldThreads = team->t.t_threads;
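  // Keep the existing thread pointers; the dispatch, disp_buffer, and
  // implicit-task arrays are simply rebuilt at the new size below.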
3099 
3100  __kmp_free(team->t.t_disp_buffer);
3101  __kmp_free(team->t.t_dispatch);
3102  __kmp_free(team->t.t_implicit_task_taskdata);
3103  __kmp_allocate_team_arrays(team, max_nth);
3104 
3105  KMP_MEMCPY(team->t.t_threads, oldThreads,
3106  team->t.t_nproc * sizeof(kmp_info_t *));
3107 
3108  __kmp_free(oldThreads);
3109 }
3110 
3111 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3112 
3113  kmp_r_sched_t r_sched =
3114  __kmp_get_schedule_global(); // get current state of scheduling globals
3115 
3116  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3117 
3118  kmp_internal_control_t g_icvs = {
3119  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3120  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3121  // adjustment of threads (per thread)
3122  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3123  // whether blocktime is explicitly set
3124  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3125 #if KMP_USE_MONITOR
3126  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3127 // intervals
3128 #endif
3129  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3130  // next parallel region (per thread)
3131  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3132  __kmp_cg_max_nth, // int thread_limit;
3133  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3134  // for max_active_levels
3135  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3136  // {sched,chunk} pair
3137  __kmp_nested_proc_bind.bind_types[0],
3138  __kmp_default_device,
3139  NULL // struct kmp_internal_control *next;
3140  };
3141 
3142  return g_icvs;
3143 }
3144 
3145 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3146 
3147  kmp_internal_control_t gx_icvs;
3148  gx_icvs.serial_nesting_level =
3149  0; // probably =team->t.t_serial like in save_inter_controls
3150  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3151  gx_icvs.next = NULL;
3152 
3153  return gx_icvs;
3154 }
3155 
3156 static void __kmp_initialize_root(kmp_root_t *root) {
3157  int f;
3158  kmp_team_t *root_team;
3159  kmp_team_t *hot_team;
3160  int hot_team_max_nth;
3161  kmp_r_sched_t r_sched =
3162  __kmp_get_schedule_global(); // get current state of scheduling globals
3163  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3164  KMP_DEBUG_ASSERT(root);
3165  KMP_ASSERT(!root->r.r_begin);
3166 
3167  /* setup the root state structure */
3168  __kmp_init_lock(&root->r.r_begin_lock);
3169  root->r.r_begin = FALSE;
3170  root->r.r_active = FALSE;
3171  root->r.r_in_parallel = 0;
3172  root->r.r_blocktime = __kmp_dflt_blocktime;
3173 #if KMP_AFFINITY_SUPPORTED
3174  root->r.r_affinity_assigned = FALSE;
3175 #endif
3176 
3177  /* setup the root team for this task */
3178  /* allocate the root team structure */
3179  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3180 
3181  root_team =
3182  __kmp_allocate_team(root,
3183  1, // new_nproc
3184  1, // max_nproc
3185 #if OMPT_SUPPORT
3186  ompt_data_none, // root parallel id
3187 #endif
3188  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3189  0 // argc
3190  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3191  );
3192 #if USE_DEBUGGER
3193  // Non-NULL value should be assigned to make the debugger display the root
3194  // team.
3195  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3196 #endif
3197 
3198  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3199 
3200  root->r.r_root_team = root_team;
3201  root_team->t.t_control_stack_top = NULL;
3202 
3203  /* initialize root team */
3204  root_team->t.t_threads[0] = NULL;
3205  root_team->t.t_nproc = 1;
3206  root_team->t.t_serialized = 1;
3207  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3208  root_team->t.t_sched.sched = r_sched.sched;
3209  KA_TRACE(
3210  20,
3211  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3212  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3213 
3214  /* setup the hot team for this task */
3215  /* allocate the hot team structure */
3216  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3217 
3218  hot_team =
3219  __kmp_allocate_team(root,
3220  1, // new_nproc
3221  __kmp_dflt_team_nth_ub * 2, // max_nproc
3222 #if OMPT_SUPPORT
3223  ompt_data_none, // root parallel id
3224 #endif
3225  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3226  0 // argc
3227  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3228  );
3229  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3230 
3231  root->r.r_hot_team = hot_team;
3232  root_team->t.t_control_stack_top = NULL;
3233 
3234  /* first-time initialization */
3235  hot_team->t.t_parent = root_team;
3236 
3237  /* initialize hot team */
3238  hot_team_max_nth = hot_team->t.t_max_nproc;
3239  for (f = 0; f < hot_team_max_nth; ++f) {
3240  hot_team->t.t_threads[f] = NULL;
3241  }
3242  hot_team->t.t_nproc = 1;
3243  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3244  hot_team->t.t_sched.sched = r_sched.sched;
3245  hot_team->t.t_size_changed = 0;
3246 }
3247 
3248 #ifdef KMP_DEBUG
3249 
3250 typedef struct kmp_team_list_item {
3251  kmp_team_p const *entry;
3252  struct kmp_team_list_item *next;
3253 } kmp_team_list_item_t;
3254 typedef kmp_team_list_item_t *kmp_team_list_t;
3255 
3256 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3257  kmp_team_list_t list, // List of teams.
3258  kmp_team_p const *team // Team to add.
3259 ) {
3260 
3261  // List must terminate with item where both entry and next are NULL.
3262  // Team is added to the list only once.
3263  // List is sorted in ascending order by team id.
3264  // Team id is *not* a key.
3265 
3266  kmp_team_list_t l;
3267 
3268  KMP_DEBUG_ASSERT(list != NULL);
3269  if (team == NULL) {
3270  return;
3271  }
3272 
3273  __kmp_print_structure_team_accum(list, team->t.t_parent);
3274  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3275 
3276  // Search list for the team.
3277  l = list;
3278  while (l->next != NULL && l->entry != team) {
3279  l = l->next;
3280  }
3281  if (l->next != NULL) {
3282  return; // Team has been added before, exit.
3283  }
3284 
3285  // Team is not found. Search list again for insertion point.
3286  l = list;
3287  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3288  l = l->next;
3289  }
3290 
3291  // Insert team.
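  // Insert before node l: copy l's contents into the freshly allocated item,
  // then overwrite l in place with the new team entry.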
3292  {
3293  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3294  sizeof(kmp_team_list_item_t));
3295  *item = *l;
3296  l->entry = team;
3297  l->next = item;
3298  }
3299 }
3300 
3301 static void __kmp_print_structure_team(char const *title,
3302                                         kmp_team_p const *team) {
3304  __kmp_printf("%s", title);
3305  if (team != NULL) {
3306  __kmp_printf("%2x %p\n", team->t.t_id, team);
3307  } else {
3308  __kmp_printf(" - (nil)\n");
3309  }
3310 }
3311 
3312 static void __kmp_print_structure_thread(char const *title,
3313  kmp_info_p const *thread) {
3314  __kmp_printf("%s", title);
3315  if (thread != NULL) {
3316  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3317  } else {
3318  __kmp_printf(" - (nil)\n");
3319  }
3320 }
3321 
3322 void __kmp_print_structure(void) {
3323 
3324  kmp_team_list_t list;
3325 
3326  // Initialize list of teams.
3327  list =
3328  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3329  list->entry = NULL;
3330  list->next = NULL;
3331 
3332  __kmp_printf("\n------------------------------\nGlobal Thread "
3333  "Table\n------------------------------\n");
3334  {
3335  int gtid;
3336  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3337  __kmp_printf("%2d", gtid);
3338  if (__kmp_threads != NULL) {
3339  __kmp_printf(" %p", __kmp_threads[gtid]);
3340  }
3341  if (__kmp_root != NULL) {
3342  __kmp_printf(" %p", __kmp_root[gtid]);
3343  }
3344  __kmp_printf("\n");
3345  }
3346  }
3347 
3348  // Print out __kmp_threads array.
3349  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3350  "----------\n");
3351  if (__kmp_threads != NULL) {
3352  int gtid;
3353  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3354  kmp_info_t const *thread = __kmp_threads[gtid];
3355  if (thread != NULL) {
3356  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3357  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3358  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3359  __kmp_print_structure_team(" Serial Team: ",
3360  thread->th.th_serial_team);
3361  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3362  __kmp_print_structure_thread(" Primary: ",
3363  thread->th.th_team_master);
3364  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3365  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3366  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3367  __kmp_print_structure_thread(" Next in pool: ",
3368  thread->th.th_next_pool);
3369  __kmp_printf("\n");
3370  __kmp_print_structure_team_accum(list, thread->th.th_team);
3371  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3372  }
3373  }
3374  } else {
3375  __kmp_printf("Threads array is not allocated.\n");
3376  }
3377 
3378  // Print out __kmp_root array.
3379  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3380  "--------\n");
3381  if (__kmp_root != NULL) {
3382  int gtid;
3383  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3384  kmp_root_t const *root = __kmp_root[gtid];
3385  if (root != NULL) {
3386  __kmp_printf("GTID %2d %p:\n", gtid, root);
3387  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3388  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3389  __kmp_print_structure_thread(" Uber Thread: ",
3390  root->r.r_uber_thread);
3391  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3392  __kmp_printf(" In Parallel: %2d\n",
3393  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3394  __kmp_printf("\n");
3395  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3396  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3397  }
3398  }
3399  } else {
3400  __kmp_printf("Ubers array is not allocated.\n");
3401  }
3402 
3403  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3404  "--------\n");
3405  while (list->next != NULL) {
3406  kmp_team_p const *team = list->entry;
3407  int i;
3408  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3409  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3410  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3411  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3412  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3413  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3414  for (i = 0; i < team->t.t_nproc; ++i) {
3415  __kmp_printf(" Thread %2d: ", i);
3416  __kmp_print_structure_thread("", team->t.t_threads[i]);
3417  }
3418  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3419  __kmp_printf("\n");
3420  list = list->next;
3421  }
3422 
3423  // Print out __kmp_thread_pool and __kmp_team_pool.
3424  __kmp_printf("\n------------------------------\nPools\n----------------------"
3425  "--------\n");
3426  __kmp_print_structure_thread("Thread pool: ",
3427  CCAST(kmp_info_t *, __kmp_thread_pool));
3428  __kmp_print_structure_team("Team pool: ",
3429  CCAST(kmp_team_t *, __kmp_team_pool));
3430  __kmp_printf("\n");
3431 
3432  // Free team list.
3433  while (list != NULL) {
3434  kmp_team_list_item_t *item = list;
3435  list = list->next;
3436  KMP_INTERNAL_FREE(item);
3437  }
3438 }
3439 
3440 #endif
3441 
3442 //---------------------------------------------------------------------------
3443 // Stuff for per-thread fast random number generator
3444 // Table of primes
3445 static const unsigned __kmp_primes[] = {
3446  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3447  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3448  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3449  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3450  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3451  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3452  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3453  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3454  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3455  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3456  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3457 
3458 //---------------------------------------------------------------------------
3459 // __kmp_get_random: Get a random number using a linear congruential method.
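// The 32-bit state advances as x = x * a + 1, where a is a per-thread prime
// chosen in __kmp_init_random; only the upper 16 bits are returned, since the
// low-order bits of such an LCG have much shorter periods.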
3460 unsigned short __kmp_get_random(kmp_info_t *thread) {
3461  unsigned x = thread->th.th_x;
3462  unsigned short r = (unsigned short)(x >> 16);
3463 
3464  thread->th.th_x = x * thread->th.th_a + 1;
3465 
3466  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3467  thread->th.th_info.ds.ds_tid, r));
3468 
3469  return r;
3470 }
3471 //--------------------------------------------------------
3472 // __kmp_init_random: Initialize a random number generator
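// Each thread's multiplier is picked from __kmp_primes by its tid, so sibling
// threads step through different pseudo-random sequences.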
3473 void __kmp_init_random(kmp_info_t *thread) {
3474  unsigned seed = thread->th.th_info.ds.ds_tid;
3475 
3476  thread->th.th_a =
3477  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3478  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3479  KA_TRACE(30,
3480  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3481 }
3482 
3483 #if KMP_OS_WINDOWS
3484 /* reclaim array entries for root threads that are already dead, returns number
3485  * reclaimed */
3486 static int __kmp_reclaim_dead_roots(void) {
3487  int i, r = 0;
3488 
3489  for (i = 0; i < __kmp_threads_capacity; ++i) {
3490  if (KMP_UBER_GTID(i) &&
3491  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3492  !__kmp_root[i]
3493  ->r.r_active) { // AC: reclaim only roots that died in a non-active state
3494  r += __kmp_unregister_root_other_thread(i);
3495  }
3496  }
3497  return r;
3498 }
3499 #endif
3500 
3501 /* This function attempts to create free entries in __kmp_threads and
3502  __kmp_root, and returns the number of free entries generated.
3503 
3504  For Windows* OS static library, the first mechanism used is to reclaim array
3505  entries for root threads that are already dead.
3506 
3507  On all platforms, expansion is attempted on the arrays __kmp_threads and
3508  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3509  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3510  threadprivate cache array has been created. Synchronization with
3511  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3512 
3513  After any dead root reclamation, if the clipping value allows array expansion
3514  to result in the generation of a total of nNeed free slots, the function does
3515  that expansion. If not, nothing is done beyond the possible initial root
3516  thread reclamation.
3517 
3518  If any argument is negative, the behavior is undefined. */
3519 static int __kmp_expand_threads(int nNeed) {
3520  int added = 0;
3521  int minimumRequiredCapacity;
3522  int newCapacity;
3523  kmp_info_t **newThreads;
3524  kmp_root_t **newRoot;
3525 
3526  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3527  // resizing __kmp_threads does not need additional protection if foreign
3528  // threads are present
3529 
3530 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3531  /* only for Windows static library */
3532  /* reclaim array entries for root threads that are already dead */
3533  added = __kmp_reclaim_dead_roots();
3534 
3535  if (nNeed) {
3536  nNeed -= added;
3537  if (nNeed < 0)
3538  nNeed = 0;
3539  }
3540 #endif
3541  if (nNeed <= 0)
3542  return added;
3543 
3544  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3545  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3546  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3547  // > __kmp_max_nth in one of two ways:
3548  //
3549  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3550  // may not be reused by another thread, so we may need to increase
3551  // __kmp_threads_capacity to __kmp_max_nth + 1.
3552  //
3553  // 2) New foreign root(s) are encountered. We always register new foreign
3554  // roots. This may cause a smaller # of threads to be allocated at
3555  // subsequent parallel regions, but the worker threads hang around (and
3556  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3557  //
3558  // Anyway, that is the reason for moving the check to see if
3559  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3560  // instead of having it performed here. -BB
3561 
3562  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3563 
3564  /* compute expansion headroom to check if we can expand */
3565  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3566  /* possible expansion too small -- give up */
3567  return added;
3568  }
3569  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3570 
3571  newCapacity = __kmp_threads_capacity;
3572  do {
3573  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3574  : __kmp_sys_max_nth;
3575  } while (newCapacity < minimumRequiredCapacity);
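  // Example: with a capacity of 64 and nNeed == 70, minimumRequiredCapacity
  // is 134, and doubling yields 128 and then 256 (assuming __kmp_sys_max_nth
  // permits), so newCapacity becomes 256.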
3576  newThreads = (kmp_info_t **)__kmp_allocate(
3577  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3578  newRoot =
3579  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3580  KMP_MEMCPY(newThreads, __kmp_threads,
3581  __kmp_threads_capacity * sizeof(kmp_info_t *));
3582  KMP_MEMCPY(newRoot, __kmp_root,
3583  __kmp_threads_capacity * sizeof(kmp_root_t *));
3584 
3585  kmp_info_t **temp_threads = __kmp_threads;
3586  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3587  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3588  __kmp_free(temp_threads);
3589  added += newCapacity - __kmp_threads_capacity;
3590  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3591 
3592  if (newCapacity > __kmp_tp_capacity) {
3593  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3594  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3595  __kmp_threadprivate_resize_cache(newCapacity);
3596  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3597  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3598  }
3599  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3600  }
3601 
3602  return added;
3603 }
3604 
3605 /* Register the current thread as a root thread and obtain our gtid. We must
3606  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3607  the thread that calls from __kmp_do_serial_initialize(). */
3608 int __kmp_register_root(int initial_thread) {
3609  kmp_info_t *root_thread;
3610  kmp_root_t *root;
3611  int gtid;
3612  int capacity;
3613  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3614  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3615  KMP_MB();
3616 
3617  /* 2007-03-02:
3618  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3619  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3620  does not work as expected -- it may return false (meaning there is at least
3621  one empty slot in the __kmp_threads array), but it is possible that the only
3622  free slot is #0, which is reserved for the initial thread and so cannot be
3623  used for this one. The following code works around this bug.
3624 
3625  However, the right solution seems to be not reserving slot #0 for the
3626  initial thread, because:
3627  (1) there is no magic in slot #0,
3628  (2) we cannot detect the initial thread reliably (the first thread that does
3629  serial initialization may not be a real initial thread).
3630  */
3631  capacity = __kmp_threads_capacity;
3632  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3633  --capacity;
3634  }
3635 
3636  // If it is not for initializing the hidden helper team, we need to take
3637  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3638  // in __kmp_threads_capacity.
3639  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3640  capacity -= __kmp_hidden_helper_threads_num;
3641  }
3642 
3643  /* see if there are too many threads */
3644  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3645  if (__kmp_tp_cached) {
3646  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3647  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3648  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3649  } else {
3650  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3651  __kmp_msg_null);
3652  }
3653  }
3654 
3655  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3656  // 0: initial thread, also a regular OpenMP thread.
3657  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3658  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3659  // regular OpenMP threads.
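  // For example, with __kmp_hidden_helper_threads_num == 8, gtids 1..8 are
  // reserved for hidden helper threads and new regular roots start their
  // search at gtid 9.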
3660  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3661  // Find an available thread slot for hidden helper thread. Slots for hidden
3662  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3663  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3664  gtid <= __kmp_hidden_helper_threads_num;
3665  gtid++)
3666  ;
3667  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3668  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3669  "hidden helper thread: T#%d\n",
3670  gtid));
3671  } else {
3672  /* find an available thread slot */
3673  // Don't reassign the zero slot since we need that to only be used by
3674  // initial thread. Slots for hidden helper threads should also be skipped.
3675  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3676  gtid = 0;
3677  } else {
3678  for (gtid = __kmp_hidden_helper_threads_num + 1;
3679  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3680  ;
3681  }
3682  KA_TRACE(
3683  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3684  KMP_ASSERT(gtid < __kmp_threads_capacity);
3685  }
3686 
3687  /* update global accounting */
3688  __kmp_all_nth++;
3689  TCW_4(__kmp_nth, __kmp_nth + 1);
3690 
3691  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3692  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3693  if (__kmp_adjust_gtid_mode) {
3694  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3695  if (TCR_4(__kmp_gtid_mode) != 2) {
3696  TCW_4(__kmp_gtid_mode, 2);
3697  }
3698  } else {
3699  if (TCR_4(__kmp_gtid_mode) != 1) {
3700  TCW_4(__kmp_gtid_mode, 1);
3701  }
3702  }
3703  }
3704 
3705 #ifdef KMP_ADJUST_BLOCKTIME
3706  /* Adjust blocktime to zero if necessary */
3707  /* Middle initialization might not have occurred yet */
3708  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3709  if (__kmp_nth > __kmp_avail_proc) {
3710  __kmp_zero_bt = TRUE;
3711  }
3712  }
3713 #endif /* KMP_ADJUST_BLOCKTIME */
3714 
3715  /* setup this new hierarchy */
3716  if (!(root = __kmp_root[gtid])) {
3717  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3718  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3719  }
3720 
3721 #if KMP_STATS_ENABLED
3722  // Initialize stats as soon as possible (right after gtid assignment).
3723  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3724  __kmp_stats_thread_ptr->startLife();
3725  KMP_SET_THREAD_STATE(SERIAL_REGION);
3726  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3727 #endif
3728  __kmp_initialize_root(root);
3729 
3730  /* setup new root thread structure */
3731  if (root->r.r_uber_thread) {
3732  root_thread = root->r.r_uber_thread;
3733  } else {
3734  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3735  if (__kmp_storage_map) {
3736  __kmp_print_thread_storage_map(root_thread, gtid);
3737  }
3738  root_thread->th.th_info.ds.ds_gtid = gtid;
3739 #if OMPT_SUPPORT
3740  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3741 #endif
3742  root_thread->th.th_root = root;
3743  if (__kmp_env_consistency_check) {
3744  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3745  }
3746 #if USE_FAST_MEMORY
3747  __kmp_initialize_fast_memory(root_thread);
3748 #endif /* USE_FAST_MEMORY */
3749 
3750 #if KMP_USE_BGET
3751  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3752  __kmp_initialize_bget(root_thread);
3753 #endif
3754  __kmp_init_random(root_thread); // Initialize random number generator
3755  }
3756 
3757  /* setup the serial team held in reserve by the root thread */
3758  if (!root_thread->th.th_serial_team) {
3759  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3760  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3761  root_thread->th.th_serial_team = __kmp_allocate_team(
3762  root, 1, 1,
3763 #if OMPT_SUPPORT
3764  ompt_data_none, // root parallel id
3765 #endif
3766  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3767  }
3768  KMP_ASSERT(root_thread->th.th_serial_team);
3769  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3770  root_thread->th.th_serial_team));
3771 
3772  /* drop root_thread into place */
3773  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3774 
3775  root->r.r_root_team->t.t_threads[0] = root_thread;
3776  root->r.r_hot_team->t.t_threads[0] = root_thread;
3777  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3778  // AC: the team created in reserve, not for execution (it is unused for now).
3779  root_thread->th.th_serial_team->t.t_serialized = 0;
3780  root->r.r_uber_thread = root_thread;
3781 
3782  /* initialize the thread, get it ready to go */
3783  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3784  TCW_4(__kmp_init_gtid, TRUE);
3785 
3786  /* prepare the primary thread for get_gtid() */
3787  __kmp_gtid_set_specific(gtid);
3788 
3789 #if USE_ITT_BUILD
3790  __kmp_itt_thread_name(gtid);
3791 #endif /* USE_ITT_BUILD */
3792 
3793 #ifdef KMP_TDATA_GTID
3794  __kmp_gtid = gtid;
3795 #endif
3796  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3797  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3798 
3799  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3800  "plain=%u\n",
3801  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3802  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3803  KMP_INIT_BARRIER_STATE));
3804  { // Initialize barrier data.
3805  int b;
3806  for (b = 0; b < bs_last_barrier; ++b) {
3807  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3808 #if USE_DEBUGGER
3809  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3810 #endif
3811  }
3812  }
3813  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3814  KMP_INIT_BARRIER_STATE);
3815 
3816 #if KMP_AFFINITY_SUPPORTED
3817  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3818  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3819  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3820  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3821 #endif /* KMP_AFFINITY_SUPPORTED */
3822  root_thread->th.th_def_allocator = __kmp_def_allocator;
3823  root_thread->th.th_prev_level = 0;
3824  root_thread->th.th_prev_num_threads = 1;
3825 
3826  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3827  tmp->cg_root = root_thread;
3828  tmp->cg_thread_limit = __kmp_cg_max_nth;
3829  tmp->cg_nthreads = 1;
3830  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3831  " cg_nthreads init to 1\n",
3832  root_thread, tmp));
3833  tmp->up = NULL;
3834  root_thread->th.th_cg_roots = tmp;
3835 
3836  __kmp_root_counter++;
3837 
3838 #if OMPT_SUPPORT
3839  if (!initial_thread && ompt_enabled.enabled) {
3840 
3841  kmp_info_t *root_thread = ompt_get_thread();
3842 
3843  ompt_set_thread_state(root_thread, ompt_state_overhead);
3844 
3845  if (ompt_enabled.ompt_callback_thread_begin) {
3846  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3847  ompt_thread_initial, __ompt_get_thread_data_internal());
3848  }
3849  ompt_data_t *task_data;
3850  ompt_data_t *parallel_data;
3851  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3852  NULL);
3853  if (ompt_enabled.ompt_callback_implicit_task) {
3854  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3855  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3856  }
3857 
3858  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3859  }
3860 #endif
3861 #if OMPD_SUPPORT
3862  if (ompd_state & OMPD_ENABLE_BP)
3863  ompd_bp_thread_begin();
3864 #endif
3865 
3866  KMP_MB();
3867  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3868 
3869  return gtid;
3870 }
3871 
3872 #if KMP_NESTED_HOT_TEAMS
3873 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3874  const int max_level) {
3875  int i, n, nth;
3876  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3877  if (!hot_teams || !hot_teams[level].hot_team) {
3878  return 0;
3879  }
3880  KMP_DEBUG_ASSERT(level < max_level);
3881  kmp_team_t *team = hot_teams[level].hot_team;
3882  nth = hot_teams[level].hot_team_nth;
3883  n = nth - 1; // primary thread is not freed
3884  if (level < max_level - 1) {
3885  for (i = 0; i < nth; ++i) {
3886  kmp_info_t *th = team->t.t_threads[i];
3887  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3888  if (i > 0 && th->th.th_hot_teams) {
3889  __kmp_free(th->th.th_hot_teams);
3890  th->th.th_hot_teams = NULL;
3891  }
3892  }
3893  }
3894  __kmp_free_team(root, team, NULL);
3895  return n;
3896 }
3897 #endif
3898 
3899 // Resets a root thread and clears its root and hot teams.
3900 // Returns the number of __kmp_threads entries directly and indirectly freed.
3901 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3902  kmp_team_t *root_team = root->r.r_root_team;
3903  kmp_team_t *hot_team = root->r.r_hot_team;
3904  int n = hot_team->t.t_nproc;
3905  int i;
3906 
3907  KMP_DEBUG_ASSERT(!root->r.r_active);
3908 
3909  root->r.r_root_team = NULL;
3910  root->r.r_hot_team = NULL;
3911  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3912  // before the call to __kmp_free_team().
3913  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3914 #if KMP_NESTED_HOT_TEAMS
3915  if (__kmp_hot_teams_max_level >
3916  0) { // need to free nested hot teams and their threads if any
3917  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3918  kmp_info_t *th = hot_team->t.t_threads[i];
3919  if (__kmp_hot_teams_max_level > 1) {
3920  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3921  }
3922  if (th->th.th_hot_teams) {
3923  __kmp_free(th->th.th_hot_teams);
3924  th->th.th_hot_teams = NULL;
3925  }
3926  }
3927  }
3928 #endif
3929  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3930 
3931  // Before we can reap the thread, we need to make certain that all other
3932  // threads in the teams that had this root as ancestor have stopped trying to
3933  // steal tasks.
3934  if (__kmp_tasking_mode != tskm_immediate_exec) {
3935  __kmp_wait_to_unref_task_teams();
3936  }
3937 
3938 #if KMP_OS_WINDOWS
3939  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3940  KA_TRACE(
3941  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3942  "\n",
3943  (LPVOID) & (root->r.r_uber_thread->th),
3944  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3945  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3946 #endif /* KMP_OS_WINDOWS */
3947 
3948 #if OMPD_SUPPORT
3949  if (ompd_state & OMPD_ENABLE_BP)
3950  ompd_bp_thread_end();
3951 #endif
3952 
3953 #if OMPT_SUPPORT
3954  ompt_data_t *task_data;
3955  ompt_data_t *parallel_data;
3956  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3957  NULL);
3958  if (ompt_enabled.ompt_callback_implicit_task) {
3959  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3960  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3961  }
3962  if (ompt_enabled.ompt_callback_thread_end) {
3963  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3964  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3965  }
3966 #endif
3967 
3968  TCW_4(__kmp_nth,
3969  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
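  // cg_nthreads is post-decremented, so i holds the count before this root
  // left the contention group; i == 1 below means it was the last member.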
3970  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3971  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3972  " to %d\n",
3973  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3974  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3975  if (i == 1) {
3976  // need to free contention group structure
3977  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3978  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3979  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3980  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3981  root->r.r_uber_thread->th.th_cg_roots = NULL;
3982  }
3983  __kmp_reap_thread(root->r.r_uber_thread, 1);
3984 
3985  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3986  // it instead of freeing it.
3987  root->r.r_uber_thread = NULL;
3988  /* mark root as no longer in use */
3989  root->r.r_begin = FALSE;
3990 
3991  return n;
3992 }
3993 
3994 void __kmp_unregister_root_current_thread(int gtid) {
3995  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3996  /* this lock should be ok, since unregister_root_current_thread is never
3997  called during an abort, only during a normal close. furthermore, if you
3998  have the forkjoin lock, you should never try to get the initz lock */
3999  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4000  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4001  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4002  "exiting T#%d\n",
4003  gtid));
4004  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4005  return;
4006  }
4007  kmp_root_t *root = __kmp_root[gtid];
4008 
4009  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010  KMP_ASSERT(KMP_UBER_GTID(gtid));
4011  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012  KMP_ASSERT(root->r.r_active == FALSE);
4013 
4014  KMP_MB();
4015 
4016  kmp_info_t *thread = __kmp_threads[gtid];
4017  kmp_team_t *team = thread->th.th_team;
4018  kmp_task_team_t *task_team = thread->th.th_task_team;
4019 
4020  // we need to wait for the proxy tasks before finishing the thread
4021  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4022 #if OMPT_SUPPORT
4023  // the runtime is shutting down so we won't report any events
4024  thread->th.ompt_thread_info.state = ompt_state_undefined;
4025 #endif
4026  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4027  }
4028 
4029  __kmp_reset_root(gtid, root);
4030 
4031  KMP_MB();
4032  KC_TRACE(10,
4033  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4034 
4035  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4036 }
4037 
4038 #if KMP_OS_WINDOWS
4039 /* __kmp_forkjoin_lock must already be held.
4040  Unregisters a root thread that is not the current thread. Returns the number
4041  of __kmp_threads entries freed as a result. */
4042 static int __kmp_unregister_root_other_thread(int gtid) {
4043  kmp_root_t *root = __kmp_root[gtid];
4044  int r;
4045 
4046  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4047  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4048  KMP_ASSERT(KMP_UBER_GTID(gtid));
4049  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4050  KMP_ASSERT(root->r.r_active == FALSE);
4051 
4052  r = __kmp_reset_root(gtid, root);
4053  KC_TRACE(10,
4054  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4055  return r;
4056 }
4057 #endif
4058 
4059 #if KMP_DEBUG
4060 void __kmp_task_info() {
4061 
4062  kmp_int32 gtid = __kmp_entry_gtid();
4063  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4064  kmp_info_t *this_thr = __kmp_threads[gtid];
4065  kmp_team_t *steam = this_thr->th.th_serial_team;
4066  kmp_team_t *team = this_thr->th.th_team;
4067 
4068  __kmp_printf(
4069  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4070  "ptask=%p\n",
4071  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4072  team->t.t_implicit_task_taskdata[tid].td_parent);
4073 }
4074 #endif // KMP_DEBUG
4075 
4076 /* TODO optimize with one big memclr, take out what isn't needed, split
4077  responsibility to workers as much as possible, and delay initialization of
4078  features as much as possible */
4079 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4080  int tid, int gtid) {
4081  /* this_thr->th.th_info.ds.ds_gtid is setup in
4082  kmp_allocate_thread/create_worker.
4083  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4084  KMP_DEBUG_ASSERT(this_thr != NULL);
4085  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4086  KMP_DEBUG_ASSERT(team);
4087  KMP_DEBUG_ASSERT(team->t.t_threads);
4088  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4089  kmp_info_t *master = team->t.t_threads[0];
4090  KMP_DEBUG_ASSERT(master);
4091  KMP_DEBUG_ASSERT(master->th.th_root);
4092 
4093  KMP_MB();
4094 
4095  TCW_SYNC_PTR(this_thr->th.th_team, team);
4096 
4097  this_thr->th.th_info.ds.ds_tid = tid;
4098  this_thr->th.th_set_nproc = 0;
4099  if (__kmp_tasking_mode != tskm_immediate_exec)
4100  // When tasking is possible, threads are not safe to reap until they are
4101  // done tasking; this will be set when tasking code is exited in wait
4102  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4103  else // no tasking --> always safe to reap
4104  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4105  this_thr->th.th_set_proc_bind = proc_bind_default;
4106 #if KMP_AFFINITY_SUPPORTED
4107  this_thr->th.th_new_place = this_thr->th.th_current_place;
4108 #endif
4109  this_thr->th.th_root = master->th.th_root;
4110 
4111  /* setup the thread's cache of the team structure */
4112  this_thr->th.th_team_nproc = team->t.t_nproc;
4113  this_thr->th.th_team_master = master;
4114  this_thr->th.th_team_serialized = team->t.t_serialized;
4115  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4116 
4117  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4118 
4119  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4120  tid, gtid, this_thr, this_thr->th.th_current_task));
4121 
4122  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4123  team, tid, TRUE);
4124 
4125  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4126  tid, gtid, this_thr, this_thr->th.th_current_task));
4127  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4128  // __kmp_initialize_team()?
4129 
4130  /* TODO no worksharing in speculative threads */
4131  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4132 
4133  this_thr->th.th_local.this_construct = 0;
4134 
4135  if (!this_thr->th.th_pri_common) {
4136  this_thr->th.th_pri_common =
4137  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4138  if (__kmp_storage_map) {
4139  __kmp_print_storage_map_gtid(
4140  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4141  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4142  }
4143  this_thr->th.th_pri_head = NULL;
4144  }
4145 
4146  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4147  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4148  // Make new thread's CG root same as primary thread's
4149  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4150  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4151  if (tmp) {
4152  // worker changes CG, need to check if old CG should be freed
4153  int i = tmp->cg_nthreads--;
4154  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4155  " on node %p of thread %p to %d\n",
4156  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4157  if (i == 1) {
4158  __kmp_free(tmp); // last thread left CG --> free it
4159  }
4160  }
4161  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4162  // Increment new thread's CG root's counter to add the new thread
4163  this_thr->th.th_cg_roots->cg_nthreads++;
4164  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4165  " node %p of thread %p to %d\n",
4166  this_thr, this_thr->th.th_cg_roots,
4167  this_thr->th.th_cg_roots->cg_root,
4168  this_thr->th.th_cg_roots->cg_nthreads));
4169  this_thr->th.th_current_task->td_icvs.thread_limit =
4170  this_thr->th.th_cg_roots->cg_thread_limit;
4171  }
4172 
4173  /* Initialize dynamic dispatch */
4174  {
4175  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4176  // Use team max_nproc since this will never change for the team.
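  // A team that can only ever hold one thread needs a single private
  // dispatch buffer; otherwise allocate __kmp_dispatch_num_buffers of them.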
4177  size_t disp_size =
4178  sizeof(dispatch_private_info_t) *
4179  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4180  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4181  team->t.t_max_nproc));
4182  KMP_ASSERT(dispatch);
4183  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4184  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4185 
4186  dispatch->th_disp_index = 0;
4187  dispatch->th_doacross_buf_idx = 0;
4188  if (!dispatch->th_disp_buffer) {
4189  dispatch->th_disp_buffer =
4190  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4191 
4192  if (__kmp_storage_map) {
4193  __kmp_print_storage_map_gtid(
4194  gtid, &dispatch->th_disp_buffer[0],
4195  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4196  ? 1
4197  : __kmp_dispatch_num_buffers],
4198  disp_size,
4199  "th_%d.th_dispatch.th_disp_buffer "
4200  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4201  gtid, team->t.t_id, gtid);
4202  }
4203  } else {
4204  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4205  }
4206 
4207  dispatch->th_dispatch_pr_current = 0;
4208  dispatch->th_dispatch_sh_current = 0;
4209 
4210  dispatch->th_deo_fcn = 0; /* ORDERED */
4211  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4212  }
4213 
4214  this_thr->th.th_next_pool = NULL;
4215 
4216  if (!this_thr->th.th_task_state_memo_stack) {
4217  size_t i;
4218  this_thr->th.th_task_state_memo_stack =
4219  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4220  this_thr->th.th_task_state_top = 0;
4221  this_thr->th.th_task_state_stack_sz = 4;
4222  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4223  ++i) // zero init the stack
4224  this_thr->th.th_task_state_memo_stack[i] = 0;
4225  }
4226 
4227  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4228  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4229 
4230  KMP_MB();
4231 }
4232 
4233 /* Allocate a new thread for the requesting team. This is only called from
4234  within a forkjoin critical section. We will first try to get an available
4235  thread from the thread pool; if none is available, we will fork a new one,
4236  assuming we are able to create one. This should be assured, as the caller
4237  should have checked on this first. */
4238 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4239  int new_tid) {
4240  kmp_team_t *serial_team;
4241  kmp_info_t *new_thr;
4242  int new_gtid;
4243 
4244  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4245  KMP_DEBUG_ASSERT(root && team);
4246 #if !KMP_NESTED_HOT_TEAMS
4247  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4248 #endif
4249  KMP_MB();
4250 
4251  /* first, try to get one from the thread pool */
4252  if (__kmp_thread_pool) {
4253  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4254  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4255  if (new_thr == __kmp_thread_pool_insert_pt) {
4256  __kmp_thread_pool_insert_pt = NULL;
4257  }
4258  TCW_4(new_thr->th.th_in_pool, FALSE);
4259  __kmp_suspend_initialize_thread(new_thr);
4260  __kmp_lock_suspend_mx(new_thr);
4261  if (new_thr->th.th_active_in_pool == TRUE) {
4262  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4263  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4264  new_thr->th.th_active_in_pool = FALSE;
4265  }
4266  __kmp_unlock_suspend_mx(new_thr);
4267 
4268  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4269  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4270  KMP_ASSERT(!new_thr->th.th_team);
4271  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4272 
4273  /* setup the thread structure */
4274  __kmp_initialize_info(new_thr, team, new_tid,
4275  new_thr->th.th_info.ds.ds_gtid);
4276  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4277 
4278  TCW_4(__kmp_nth, __kmp_nth + 1);
4279 
4280  new_thr->th.th_task_state = 0;
4281  new_thr->th.th_task_state_top = 0;
4282  new_thr->th.th_task_state_stack_sz = 4;
4283 
4284 #ifdef KMP_ADJUST_BLOCKTIME
4285  /* Adjust blocktime back to zero if necessary */
4286  /* Middle initialization might not have occurred yet */
4287  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4288  if (__kmp_nth > __kmp_avail_proc) {
4289  __kmp_zero_bt = TRUE;
4290  }
4291  }
4292 #endif /* KMP_ADJUST_BLOCKTIME */
4293 
4294 #if KMP_DEBUG
4295  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4296  // KMP_BARRIER_PARENT_FLAG.
4297  int b;
4298  kmp_balign_t *balign = new_thr->th.th_bar;
4299  for (b = 0; b < bs_last_barrier; ++b)
4300  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4301 #endif
4302 
4303  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4304  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4305 
4306  KMP_MB();
4307  return new_thr;
4308  }
4309 
4310  /* no, we'll fork a new one */
4311  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4312  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4313 
4314 #if KMP_USE_MONITOR
4315  // If this is the first worker thread the RTL is creating, then also
4316  // launch the monitor thread. We try to do this as early as possible.
4317  if (!TCR_4(__kmp_init_monitor)) {
4318  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4319  if (!TCR_4(__kmp_init_monitor)) {
4320  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4321  TCW_4(__kmp_init_monitor, 1);
4322  __kmp_create_monitor(&__kmp_monitor);
4323  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4324 #if KMP_OS_WINDOWS
4325  // AC: wait until monitor has started. This is a fix for CQ232808.
4326  // The reason is that if the library is loaded/unloaded in a loop with
4327  // small (parallel) work in between, then there is a high probability that
4328  // the monitor thread started after the library shutdown. At shutdown it is
4329  // too late to cope with the problem, because when the primary thread is
4330  // in DllMain (process detach) the monitor has no chance to start (it is
4331  // blocked), and the primary thread has no means to inform the monitor that
4332  // the library has gone, because all the memory which the monitor can
4333  // access is going to be released/reset.
4334  while (TCR_4(__kmp_init_monitor) < 2) {
4335  KMP_YIELD(TRUE);
4336  }
4337  KF_TRACE(10, ("after monitor thread has started\n"));
4338 #endif
4339  }
4340  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4341  }
4342 #endif
4343 
4344  KMP_MB();
4345 
4346  {
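  // Hidden helper workers claim gtids starting at 1; regular workers start
  // searching just past the slots reserved for hidden helpers.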
4347  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4348  ? 1
4349  : __kmp_hidden_helper_threads_num + 1;
4350 
4351  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4352  ++new_gtid) {
4353  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4354  }
4355 
4356  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4357  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4358  }
4359  }
4360 
4361  /* allocate space for it. */
4362  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4363 
4364  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4365 
4366 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4367  // suppress race conditions detection on synchronization flags in debug mode
4368  // this helps to analyze library internals eliminating false positives
4369  __itt_suppress_mark_range(
4370  __itt_suppress_range, __itt_suppress_threading_errors,
4371  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4372  __itt_suppress_mark_range(
4373  __itt_suppress_range, __itt_suppress_threading_errors,
4374  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4375 #if KMP_OS_WINDOWS
4376  __itt_suppress_mark_range(
4377  __itt_suppress_range, __itt_suppress_threading_errors,
4378  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4379 #else
4380  __itt_suppress_mark_range(__itt_suppress_range,
4381  __itt_suppress_threading_errors,
4382  &new_thr->th.th_suspend_init_count,
4383  sizeof(new_thr->th.th_suspend_init_count));
4384 #endif
4385  // TODO: check if we need to also suppress b_arrived flags
4386  __itt_suppress_mark_range(__itt_suppress_range,
4387  __itt_suppress_threading_errors,
4388  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4389  sizeof(new_thr->th.th_bar[0].bb.b_go));
4390  __itt_suppress_mark_range(__itt_suppress_range,
4391  __itt_suppress_threading_errors,
4392  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4393  sizeof(new_thr->th.th_bar[1].bb.b_go));
4394  __itt_suppress_mark_range(__itt_suppress_range,
4395  __itt_suppress_threading_errors,
4396  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4397  sizeof(new_thr->th.th_bar[2].bb.b_go));
4398 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4399  if (__kmp_storage_map) {
4400  __kmp_print_thread_storage_map(new_thr, new_gtid);
4401  }
4402 
4403  // add the reserve serialized team, initialized from the team's primary thread
4404  {
4405  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4406  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4407  new_thr->th.th_serial_team = serial_team =
4408  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4409 #if OMPT_SUPPORT
4410  ompt_data_none, // root parallel id
4411 #endif
4412  proc_bind_default, &r_icvs,
4413  0 USE_NESTED_HOT_ARG(NULL));
4414  }
4415  KMP_ASSERT(serial_team);
4416  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4417  // execution (it is unused for now).
4418  serial_team->t.t_threads[0] = new_thr;
4419  KF_TRACE(10,
4420  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4421  new_thr));
4422 
4423  /* setup the thread structures */
4424  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4425 
4426 #if USE_FAST_MEMORY
4427  __kmp_initialize_fast_memory(new_thr);
4428 #endif /* USE_FAST_MEMORY */
4429 
4430 #if KMP_USE_BGET
4431  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4432  __kmp_initialize_bget(new_thr);
4433 #endif
4434 
4435  __kmp_init_random(new_thr); // Initialize random number generator
4436 
4437  /* Initialize these only once when thread is grabbed for a team allocation */
4438  KA_TRACE(20,
4439  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4440  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4441 
4442  int b;
4443  kmp_balign_t *balign = new_thr->th.th_bar;
4444  for (b = 0; b < bs_last_barrier; ++b) {
4445  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4446  balign[b].bb.team = NULL;
4447  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4448  balign[b].bb.use_oncore_barrier = 0;
4449  }
4450 
4451  new_thr->th.th_spin_here = FALSE;
4452  new_thr->th.th_next_waiting = 0;
4453 #if KMP_OS_UNIX
4454  new_thr->th.th_blocking = false;
4455 #endif
4456 
4457 #if KMP_AFFINITY_SUPPORTED
4458  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4459  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4460  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4461  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4462 #endif
4463  new_thr->th.th_def_allocator = __kmp_def_allocator;
4464  new_thr->th.th_prev_level = 0;
4465  new_thr->th.th_prev_num_threads = 1;
4466 
4467  TCW_4(new_thr->th.th_in_pool, FALSE);
4468  new_thr->th.th_active_in_pool = FALSE;
4469  TCW_4(new_thr->th.th_active, TRUE);
4470 
4471  /* adjust the global counters */
4472  __kmp_all_nth++;
4473  __kmp_nth++;
4474 
4475  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4476  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4477  if (__kmp_adjust_gtid_mode) {
4478  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4479  if (TCR_4(__kmp_gtid_mode) != 2) {
4480  TCW_4(__kmp_gtid_mode, 2);
4481  }
4482  } else {
4483  if (TCR_4(__kmp_gtid_mode) != 1) {
4484  TCW_4(__kmp_gtid_mode, 1);
4485  }
4486  }
4487  }
4488 
4489 #ifdef KMP_ADJUST_BLOCKTIME
4490  /* Adjust blocktime back to zero if necessary */
4491  /* Middle initialization might not have occurred yet */
4492  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4493  if (__kmp_nth > __kmp_avail_proc) {
4494  __kmp_zero_bt = TRUE;
4495  }
4496  }
4497 #endif /* KMP_ADJUST_BLOCKTIME */
4498 
4499  /* actually fork it and create the new worker thread */
4500  KF_TRACE(
4501  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4502  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4503  KF_TRACE(10,
4504  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4505 
4506  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4507  new_gtid));
4508  KMP_MB();
4509  return new_thr;
4510 }
4511 
4512 /* Reinitialize team for reuse.
4513  The hot team code calls this routine at every fork barrier, so EPCC barrier
4514  tests are extremely sensitive to changes in it, esp. writes to the team
4515  struct, which cause a cache invalidation in all threads.
4516  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4517 static void __kmp_reinitialize_team(kmp_team_t *team,
4518  kmp_internal_control_t *new_icvs,
4519  ident_t *loc) {
4520  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4521  team->t.t_threads[0], team));
4522  KMP_DEBUG_ASSERT(team && new_icvs);
4523  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4524  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4525 
4526  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4527  // Copy ICVs to the primary thread's implicit taskdata
4528  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4529  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4530 
4531  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4532  team->t.t_threads[0], team));
4533 }
4534 
4535 /* Initialize the team data structure.
4536  This assumes the t_threads and t_max_nproc are already set.
4537  Also, we don't touch the arguments */
4538 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4539  kmp_internal_control_t *new_icvs,
4540  ident_t *loc) {
4541  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4542 
4543  /* verify */
4544  KMP_DEBUG_ASSERT(team);
4545  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4546  KMP_DEBUG_ASSERT(team->t.t_threads);
4547  KMP_MB();
4548 
4549  team->t.t_master_tid = 0; /* not needed */
4550  /* team->t.t_master_bar; not needed */
4551  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4552  team->t.t_nproc = new_nproc;
4553 
4554  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4555  team->t.t_next_pool = NULL;
4556  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4557  * up hot team */
4558 
4559  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4560  team->t.t_invoke = NULL; /* not needed */
4561 
4562  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4563  team->t.t_sched.sched = new_icvs->sched.sched;
4564 
4565 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4566  team->t.t_fp_control_saved = FALSE; /* not needed */
4567  team->t.t_x87_fpu_control_word = 0; /* not needed */
4568  team->t.t_mxcsr = 0; /* not needed */
4569 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4570 
4571  team->t.t_construct = 0;
4572 
4573  team->t.t_ordered.dt.t_value = 0;
4574  team->t.t_master_active = FALSE;
4575 
4576 #ifdef KMP_DEBUG
4577  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4578 #endif
4579 #if KMP_OS_WINDOWS
4580  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4581 #endif
4582 
4583  team->t.t_control_stack_top = NULL;
4584 
4585  __kmp_reinitialize_team(team, new_icvs, loc);
4586 
4587  KMP_MB();
4588  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4589 }
4590 
4591 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4592 /* Sets full mask for thread and returns old mask, no changes to structures. */
4593 static void
4594 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4595  if (KMP_AFFINITY_CAPABLE()) {
4596  int status;
4597  if (old_mask != NULL) {
4598  status = __kmp_get_system_affinity(old_mask, TRUE);
4599  int error = errno;
4600  if (status != 0) {
4601  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4602  __kmp_msg_null);
4603  }
4604  }
4605  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4606  }
4607 }
4608 #endif
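/* Typical save/widen/restore pattern for the helper above (sketch mirroring
   its use in __kmp_allocate_team below):
     kmp_affin_mask_t *old_mask;
     KMP_CPU_ALLOC(old_mask);
     __kmp_set_thread_affinity_mask_full_tmp(old_mask); // save old, set full
     ... create worker threads, which inherit the full mask ...
     __kmp_set_system_affinity(old_mask, TRUE); // restore primary's mask
     KMP_CPU_FREE(old_mask);
*/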
4609 
4610 #if KMP_AFFINITY_SUPPORTED
4611 
4612 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4613 // It calculates the worker + primary thread's partition based upon the parent
4614  // thread's partition, and binds each worker to a place in its partition.
4615 // The primary thread's partition should already include its current binding.
4616 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4617  // Do not partition places for the hidden helper team
4618  if (KMP_HIDDEN_HELPER_TEAM(team))
4619  return;
4620  // Copy the primary thread's place partition to the team struct
4621  kmp_info_t *master_th = team->t.t_threads[0];
4622  KMP_DEBUG_ASSERT(master_th != NULL);
4623  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4624  int first_place = master_th->th.th_first_place;
4625  int last_place = master_th->th.th_last_place;
4626  int masters_place = master_th->th.th_current_place;
4627  team->t.t_first_place = first_place;
4628  team->t.t_last_place = last_place;
4629 
4630  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4631  "bound to place %d partition = [%d,%d]\n",
4632  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4633  team->t.t_id, masters_place, first_place, last_place));
4634 
4635  switch (proc_bind) {
4636 
4637  case proc_bind_default:
4638  // Serial teams might have the proc_bind policy set to proc_bind_default.
4639  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4640  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4641  break;
4642 
4643  case proc_bind_primary: {
4644  int f;
4645  int n_th = team->t.t_nproc;
4646  for (f = 1; f < n_th; f++) {
4647  kmp_info_t *th = team->t.t_threads[f];
4648  KMP_DEBUG_ASSERT(th != NULL);
4649  th->th.th_first_place = first_place;
4650  th->th.th_last_place = last_place;
4651  th->th.th_new_place = masters_place;
4652  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4653  team->t.t_display_affinity != 1) {
4654  team->t.t_display_affinity = 1;
4655  }
4656 
4657  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4658  "partition = [%d,%d]\n",
4659  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4660  f, masters_place, first_place, last_place));
4661  }
4662  } break;
4663 
4664  case proc_bind_close: {
4665  int f;
4666  int n_th = team->t.t_nproc;
4667  int n_places;
4668  if (first_place <= last_place) {
4669  n_places = last_place - first_place + 1;
4670  } else {
4671  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4672  }
4673  if (n_th <= n_places) {
4674  int place = masters_place;
4675  for (f = 1; f < n_th; f++) {
4676  kmp_info_t *th = team->t.t_threads[f];
4677  KMP_DEBUG_ASSERT(th != NULL);
4678 
4679  if (place == last_place) {
4680  place = first_place;
4681  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4682  place = 0;
4683  } else {
4684  place++;
4685  }
4686  th->th.th_first_place = first_place;
4687  th->th.th_last_place = last_place;
4688  th->th.th_new_place = place;
4689  if (__kmp_display_affinity && place != th->th.th_current_place &&
4690  team->t.t_display_affinity != 1) {
4691  team->t.t_display_affinity = 1;
4692  }
4693 
4694  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4695  "partition = [%d,%d]\n",
4696  __kmp_gtid_from_thread(team->t.t_threads[f]),
4697  team->t.t_id, f, place, first_place, last_place));
4698  }
4699  } else {
4700  int S, rem, gap, s_count;
4701  S = n_th / n_places;
4702  s_count = 0;
4703  rem = n_th - (S * n_places);
4704  gap = rem > 0 ? n_places / rem : n_places;
4705  int place = masters_place;
4706  int gap_ct = gap;
4707  for (f = 0; f < n_th; f++) {
4708  kmp_info_t *th = team->t.t_threads[f];
4709  KMP_DEBUG_ASSERT(th != NULL);
4710 
4711  th->th.th_first_place = first_place;
4712  th->th.th_last_place = last_place;
4713  th->th.th_new_place = place;
4714  if (__kmp_display_affinity && place != th->th.th_current_place &&
4715  team->t.t_display_affinity != 1) {
4716  team->t.t_display_affinity = 1;
4717  }
4718  s_count++;
4719 
4720  if ((s_count == S) && rem && (gap_ct == gap)) {
4721  // do nothing, add an extra thread to place on next iteration
4722  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4723  // we added an extra thread to this place; move to next place
4724  if (place == last_place) {
4725  place = first_place;
4726  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4727  place = 0;
4728  } else {
4729  place++;
4730  }
4731  s_count = 0;
4732  gap_ct = 1;
4733  rem--;
4734  } else if (s_count == S) { // place full; don't add extra
4735  if (place == last_place) {
4736  place = first_place;
4737  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4738  place = 0;
4739  } else {
4740  place++;
4741  }
4742  gap_ct++;
4743  s_count = 0;
4744  }
4745 
4746  KA_TRACE(100,
4747  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4748  "partition = [%d,%d]\n",
4749  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4750  th->th.th_new_place, first_place, last_place));
4751  }
4752  KMP_DEBUG_ASSERT(place == masters_place);
4753  }
4754  } break;
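  // Worked example for the oversubscribed close case above (illustrative
  // numbers): 6 threads over a 4-place partition [0,3] with the primary on
  // place 0 gives S=1, rem=2, gap=2, so threads {0,1} land on place 0, {2} on
  // place 1, {3,4} on place 2 and {5} on place 3, and the loop finishes with
  // place == masters_place, as the assertion checks.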
4755 
4756  case proc_bind_spread: {
4757  int f;
4758  int n_th = team->t.t_nproc;
4759  int n_places;
4760  int thidx;
4761  if (first_place <= last_place) {
4762  n_places = last_place - first_place + 1;
4763  } else {
4764  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4765  }
4766  if (n_th <= n_places) {
4767  int place = -1;
4768 
4769  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4770  int S = n_places / n_th;
4771  int s_count, rem, gap, gap_ct;
4772 
4773  place = masters_place;
4774  rem = n_places - n_th * S;
4775  gap = rem ? n_th / rem : 1;
4776  gap_ct = gap;
4777  thidx = n_th;
4778  if (update_master_only == 1)
4779  thidx = 1;
4780  for (f = 0; f < thidx; f++) {
4781  kmp_info_t *th = team->t.t_threads[f];
4782  KMP_DEBUG_ASSERT(th != NULL);
4783 
4784  th->th.th_first_place = place;
4785  th->th.th_new_place = place;
4786  if (__kmp_display_affinity && place != th->th.th_current_place &&
4787  team->t.t_display_affinity != 1) {
4788  team->t.t_display_affinity = 1;
4789  }
4790  s_count = 1;
4791  while (s_count < S) {
4792  if (place == last_place) {
4793  place = first_place;
4794  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4795  place = 0;
4796  } else {
4797  place++;
4798  }
4799  s_count++;
4800  }
4801  if (rem && (gap_ct == gap)) {
4802  if (place == last_place) {
4803  place = first_place;
4804  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4805  place = 0;
4806  } else {
4807  place++;
4808  }
4809  rem--;
4810  gap_ct = 0;
4811  }
4812  th->th.th_last_place = place;
4813  gap_ct++;
4814 
4815  if (place == last_place) {
4816  place = first_place;
4817  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4818  place = 0;
4819  } else {
4820  place++;
4821  }
4822 
4823  KA_TRACE(100,
4824  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4825  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4826  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4827  f, th->th.th_new_place, th->th.th_first_place,
4828  th->th.th_last_place, __kmp_affinity_num_masks));
4829  }
4830  } else {
4831  /* Having a uniform space of available computation places, we can create
4832  T partitions of round(P/T) size and put threads into the first
4833  place of each partition. */
4834  double current = static_cast<double>(masters_place);
4835  double spacing =
4836  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4837  int first, last;
4838  kmp_info_t *th;
4839 
4840  thidx = n_th + 1;
4841  if (update_master_only == 1)
4842  thidx = 1;
4843  for (f = 0; f < thidx; f++) {
4844  first = static_cast<int>(current);
4845  last = static_cast<int>(current + spacing) - 1;
4846  KMP_DEBUG_ASSERT(last >= first);
4847  if (first >= n_places) {
4848  if (masters_place) {
4849  first -= n_places;
4850  last -= n_places;
4851  if (first == (masters_place + 1)) {
4852  KMP_DEBUG_ASSERT(f == n_th);
4853  first--;
4854  }
4855  if (last == masters_place) {
4856  KMP_DEBUG_ASSERT(f == (n_th - 1));
4857  last--;
4858  }
4859  } else {
4860  KMP_DEBUG_ASSERT(f == n_th);
4861  first = 0;
4862  last = 0;
4863  }
4864  }
4865  if (last >= n_places) {
4866  last = (n_places - 1);
4867  }
4868  place = first;
4869  current += spacing;
4870  if (f < n_th) {
4871  KMP_DEBUG_ASSERT(0 <= first);
4872  KMP_DEBUG_ASSERT(n_places > first);
4873  KMP_DEBUG_ASSERT(0 <= last);
4874  KMP_DEBUG_ASSERT(n_places > last);
4875  KMP_DEBUG_ASSERT(last_place >= first_place);
4876  th = team->t.t_threads[f];
4877  KMP_DEBUG_ASSERT(th);
4878  th->th.th_first_place = first;
4879  th->th.th_new_place = place;
4880  th->th.th_last_place = last;
4881  if (__kmp_display_affinity && place != th->th.th_current_place &&
4882  team->t.t_display_affinity != 1) {
4883  team->t.t_display_affinity = 1;
4884  }
4885  KA_TRACE(100,
4886  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887  "partition = [%d,%d], spacing = %.4f\n",
4888  __kmp_gtid_from_thread(team->t.t_threads[f]),
4889  team->t.t_id, f, th->th.th_new_place,
4890  th->th.th_first_place, th->th.th_last_place, spacing));
4891  }
4892  }
4893  }
4894  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4895  } else {
4896  int S, rem, gap, s_count;
4897  S = n_th / n_places;
4898  s_count = 0;
4899  rem = n_th - (S * n_places);
4900  gap = rem > 0 ? n_places / rem : n_places;
4901  int place = masters_place;
4902  int gap_ct = gap;
4903  thidx = n_th;
4904  if (update_master_only == 1)
4905  thidx = 1;
4906  for (f = 0; f < thidx; f++) {
4907  kmp_info_t *th = team->t.t_threads[f];
4908  KMP_DEBUG_ASSERT(th != NULL);
4909 
4910  th->th.th_first_place = place;
4911  th->th.th_last_place = place;
4912  th->th.th_new_place = place;
4913  if (__kmp_display_affinity && place != th->th.th_current_place &&
4914  team->t.t_display_affinity != 1) {
4915  team->t.t_display_affinity = 1;
4916  }
4917  s_count++;
4918 
4919  if ((s_count == S) && rem && (gap_ct == gap)) {
4920  // do nothing, add an extra thread to place on next iteration
4921  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4922  // we added an extra thread to this place; move on to next place
4923  if (place == last_place) {
4924  place = first_place;
4925  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4926  place = 0;
4927  } else {
4928  place++;
4929  }
4930  s_count = 0;
4931  gap_ct = 1;
4932  rem--;
4933  } else if (s_count == S) { // place is full; don't add extra thread
4934  if (place == last_place) {
4935  place = first_place;
4936  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4937  place = 0;
4938  } else {
4939  place++;
4940  }
4941  gap_ct++;
4942  s_count = 0;
4943  }
4944 
4945  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4946  "partition = [%d,%d]\n",
4947  __kmp_gtid_from_thread(team->t.t_threads[f]),
4948  team->t.t_id, f, th->th.th_new_place,
4949  th->th.th_first_place, th->th.th_last_place));
4950  }
4951  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4952  }
4953  } break;
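  // Worked example for the spread case above (illustrative numbers): 3
  // threads over an 8-place partition [0,7] that is narrower than the full
  // place list gives S=2, rem=2, gap=1, so thread 0 receives sub-partition
  // [0,2], thread 1 receives [3,5] and thread 2 receives [6,7], each bound to
  // the first place of its sub-partition.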
4954 
4955  default:
4956  break;
4957  }
4958 
4959  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4960 }
4961 
4962 #endif // KMP_AFFINITY_SUPPORTED
4963 
4964 /* allocate a new team data structure to use. take one off of the free pool if
4965  available */
4966 kmp_team_t *
4967 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4968 #if OMPT_SUPPORT
4969  ompt_data_t ompt_parallel_data,
4970 #endif
4971  kmp_proc_bind_t new_proc_bind,
4972  kmp_internal_control_t *new_icvs,
4973  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4974  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4975  int f;
4976  kmp_team_t *team;
4977  int use_hot_team = !root->r.r_active;
4978  int level = 0;
4979 
4980  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4981  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4982  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4983  KMP_MB();
4984 
4985 #if KMP_NESTED_HOT_TEAMS
4986  kmp_hot_team_ptr_t *hot_teams;
4987  if (master) {
4988  team = master->th.th_team;
4989  level = team->t.t_active_level;
4990  if (master->th.th_teams_microtask) { // in teams construct?
4991  if (master->th.th_teams_size.nteams > 1 &&
4992  ( // #teams > 1
4993  team->t.t_pkfn ==
4994  (microtask_t)__kmp_teams_master || // inner fork of the teams
4995  master->th.th_teams_level <
4996  team->t.t_level)) { // or nested parallel inside the teams
4997  ++level; // don't increment if #teams==1 or for the outer fork of the
4998  // teams; increment otherwise
4999  }
5000  }
5001  hot_teams = master->th.th_hot_teams;
5002  if (level < __kmp_hot_teams_max_level && hot_teams &&
5003  hot_teams[level].hot_team) {
5004  // hot team has already been allocated for given level
5005  use_hot_team = 1;
5006  } else {
5007  use_hot_team = 0;
5008  }
5009  } else {
5010  // check we won't access uninitialized hot_teams, just in case
5011  KMP_DEBUG_ASSERT(new_nproc == 1);
5012  }
5013 #endif
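  // Sketch of the level bookkeeping above: `level` indexes the primary
  // thread's hot_teams[] array and is bumped inside a teams construct with
  // more than one team (for the inner fork, or a parallel nested in the teams
  // region). Each nesting depth below __kmp_hot_teams_max_level can therefore
  // keep its own reusable hot team; deeper levels fall back to freshly
  // allocated teams.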
5014  // Optimization to use a "hot" team
5015  if (use_hot_team && new_nproc > 1) {
5016  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5017 #if KMP_NESTED_HOT_TEAMS
5018  team = hot_teams[level].hot_team;
5019 #else
5020  team = root->r.r_hot_team;
5021 #endif
5022 #if KMP_DEBUG
5023  if (__kmp_tasking_mode != tskm_immediate_exec) {
5024  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5025  "task_team[1] = %p before reinit\n",
5026  team->t.t_task_team[0], team->t.t_task_team[1]));
5027  }
5028 #endif
5029 
5030  // Has the number of threads changed?
5031  /* Let's assume the most common case is that the number of threads is
5032  unchanged, and put that case first. */
5033  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5034  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5035  // This case can mean that omp_set_num_threads() was called and the hot
5036  // team size was already reduced, so we check the special flag
5037  if (team->t.t_size_changed == -1) {
5038  team->t.t_size_changed = 1;
5039  } else {
5040  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5041  }
5042 
5043  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5044  kmp_r_sched_t new_sched = new_icvs->sched;
5045  // set primary thread's schedule as new run-time schedule
5046  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5047 
5048  __kmp_reinitialize_team(team, new_icvs,
5049  root->r.r_uber_thread->th.th_ident);
5050 
5051  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5052  team->t.t_threads[0], team));
5053  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5054 
5055 #if KMP_AFFINITY_SUPPORTED
5056  if ((team->t.t_size_changed == 0) &&
5057  (team->t.t_proc_bind == new_proc_bind)) {
5058  if (new_proc_bind == proc_bind_spread) {
5059  __kmp_partition_places(
5060  team, 1); // add flag to update only master for spread
5061  }
5062  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5063  "proc_bind = %d, partition = [%d,%d]\n",
5064  team->t.t_id, new_proc_bind, team->t.t_first_place,
5065  team->t.t_last_place));
5066  } else {
5067  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5068  __kmp_partition_places(team);
5069  }
5070 #else
5071  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5072 #endif /* KMP_AFFINITY_SUPPORTED */
5073  } else if (team->t.t_nproc > new_nproc) {
5074  KA_TRACE(20,
5075  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5076  new_nproc));
5077 
5078  team->t.t_size_changed = 1;
5079 #if KMP_NESTED_HOT_TEAMS
5080  if (__kmp_hot_teams_mode == 0) {
5081  // AC: the saved number of threads should correspond to the team's value in
5082  // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5083  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5084  hot_teams[level].hot_team_nth = new_nproc;
5085 #endif // KMP_NESTED_HOT_TEAMS
5086  /* release the extra threads we don't need any more */
5087  for (f = new_nproc; f < team->t.t_nproc; f++) {
5088  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5089  if (__kmp_tasking_mode != tskm_immediate_exec) {
5090  // When decreasing team size, threads no longer in the team should
5091  // unref task team.
5092  team->t.t_threads[f]->th.th_task_team = NULL;
5093  }
5094  __kmp_free_thread(team->t.t_threads[f]);
5095  team->t.t_threads[f] = NULL;
5096  }
5097 #if KMP_NESTED_HOT_TEAMS
5098  } // (__kmp_hot_teams_mode == 0)
5099  else {
5100  // When keeping extra threads in the team, switch them to wait on their own
5101  // b_go flag
5102  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5103  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5104  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5105  for (int b = 0; b < bs_last_barrier; ++b) {
5106  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5107  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5108  }
5109  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5110  }
5111  }
5112  }
5113 #endif // KMP_NESTED_HOT_TEAMS
5114  team->t.t_nproc = new_nproc;
5115  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5116  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5117  __kmp_reinitialize_team(team, new_icvs,
5118  root->r.r_uber_thread->th.th_ident);
5119 
5120  // Update remaining threads
5121  for (f = 0; f < new_nproc; ++f) {
5122  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5123  }
5124 
5125  // restore the current task state of the primary thread: should be the
5126  // implicit task
5127  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5128  team->t.t_threads[0], team));
5129 
5130  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5131 
5132 #ifdef KMP_DEBUG
5133  for (f = 0; f < team->t.t_nproc; f++) {
5134  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5135  team->t.t_threads[f]->th.th_team_nproc ==
5136  team->t.t_nproc);
5137  }
5138 #endif
5139 
5140  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5141 #if KMP_AFFINITY_SUPPORTED
5142  __kmp_partition_places(team);
5143 #endif
5144  } else { // team->t.t_nproc < new_nproc
5145 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5146  kmp_affin_mask_t *old_mask;
5147  if (KMP_AFFINITY_CAPABLE()) {
5148  KMP_CPU_ALLOC(old_mask);
5149  }
5150 #endif
5151 
5152  KA_TRACE(20,
5153  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5154  new_nproc));
5155 
5156  team->t.t_size_changed = 1;
5157 
5158 #if KMP_NESTED_HOT_TEAMS
5159  int avail_threads = hot_teams[level].hot_team_nth;
5160  if (new_nproc < avail_threads)
5161  avail_threads = new_nproc;
5162  kmp_info_t **other_threads = team->t.t_threads;
5163  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5164  // Adjust barrier data of reserved threads (if any) of the team
5165  // Other data will be set in __kmp_initialize_info() below.
5166  int b;
5167  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5168  for (b = 0; b < bs_last_barrier; ++b) {
5169  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5170  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5171 #if USE_DEBUGGER
5172  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5173 #endif
5174  }
5175  }
5176  if (hot_teams[level].hot_team_nth >= new_nproc) {
5177  // we have all needed threads in reserve, no need to allocate any;
5178  // this is only possible in mode 1, as mode 0 cannot have reserved threads
5179  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5180  team->t.t_nproc = new_nproc; // just get reserved threads involved
5181  } else {
5182  // we may have some threads in reserve, but not enough
5183  team->t.t_nproc =
5184  hot_teams[level]
5185  .hot_team_nth; // get reserved threads involved if any
5186  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5187 #endif // KMP_NESTED_HOT_TEAMS
5188  if (team->t.t_max_nproc < new_nproc) {
5189  /* reallocate larger arrays */
5190  __kmp_reallocate_team_arrays(team, new_nproc);
5191  __kmp_reinitialize_team(team, new_icvs, NULL);
5192  }
5193 
5194 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5195  /* Temporarily set full mask for primary thread before creation of
5196  workers. The reason is that workers inherit the affinity from the
5197  primary thread, so if a lot of workers are created on a single core
5198  quickly, they don't get a chance to set their own affinity for
5199  a long time. */
5200  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5201 #endif
5202 
5203  /* allocate new threads for the hot team */
5204  for (f = team->t.t_nproc; f < new_nproc; f++) {
5205  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5206  KMP_DEBUG_ASSERT(new_worker);
5207  team->t.t_threads[f] = new_worker;
5208 
5209  KA_TRACE(20,
5210  ("__kmp_allocate_team: team %d init T#%d arrived: "
5211  "join=%llu, plain=%llu\n",
5212  team->t.t_id, __kmp_gtid_from_tid(f, team),
5213  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5214  team->t.t_bar[bs_plain_barrier].b_arrived));
5215 
5216  { // Initialize barrier data for new threads.
5217  int b;
5218  kmp_balign_t *balign = new_worker->th.th_bar;
5219  for (b = 0; b < bs_last_barrier; ++b) {
5220  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5221  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5222  KMP_BARRIER_PARENT_FLAG);
5223 #if USE_DEBUGGER
5224  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5225 #endif
5226  }
5227  }
5228  }
5229 
5230 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5231  if (KMP_AFFINITY_CAPABLE()) {
5232  /* Restore initial primary thread's affinity mask */
5233  __kmp_set_system_affinity(old_mask, TRUE);
5234  KMP_CPU_FREE(old_mask);
5235  }
5236 #endif
5237 #if KMP_NESTED_HOT_TEAMS
5238  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5239 #endif // KMP_NESTED_HOT_TEAMS
5240  /* make sure everyone is synchronized */
5241  int old_nproc = team->t.t_nproc; // save old value and use to update only
5242  // new threads below
5243  __kmp_initialize_team(team, new_nproc, new_icvs,
5244  root->r.r_uber_thread->th.th_ident);
5245 
5246  /* reinitialize the threads */
5247  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5248  for (f = 0; f < team->t.t_nproc; ++f)
5249  __kmp_initialize_info(team->t.t_threads[f], team, f,
5250  __kmp_gtid_from_tid(f, team));
5251 
5252  if (level) { // set th_task_state for new threads in nested hot team
5253  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5254  // only need to set the th_task_state for the new threads. th_task_state
5255  // for primary thread will not be accurate until after this in
5256  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5257  // get the correct value.
5258  for (f = old_nproc; f < team->t.t_nproc; ++f)
5259  team->t.t_threads[f]->th.th_task_state =
5260  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5261  } else { // set th_task_state for new threads in non-nested hot team
5262  // copy primary thread's state
5263  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5264  for (f = old_nproc; f < team->t.t_nproc; ++f)
5265  team->t.t_threads[f]->th.th_task_state = old_state;
5266  }
5267 
5268 #ifdef KMP_DEBUG
5269  for (f = 0; f < team->t.t_nproc; ++f) {
5270  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5271  team->t.t_threads[f]->th.th_team_nproc ==
5272  team->t.t_nproc);
5273  }
5274 #endif
5275 
5276  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5277 #if KMP_AFFINITY_SUPPORTED
5278  __kmp_partition_places(team);
5279 #endif
5280  } // Check changes in number of threads
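  // Recap of the three resize paths above: an unchanged size only refreshes
  // ICVs, schedule and proc-bind; shrinking either releases the surplus
  // threads to the pool (__kmp_hot_teams_mode == 0) or parks them on their
  // own b_go flag (mode 1); growing reactivates any reserved threads, then
  // allocates new workers and reinitializes every thread in the team.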
5281 
5282  kmp_info_t *master = team->t.t_threads[0];
5283  if (master->th.th_teams_microtask) {
5284  for (f = 1; f < new_nproc; ++f) {
5285  // propagate teams construct specific info to workers
5286  kmp_info_t *thr = team->t.t_threads[f];
5287  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5288  thr->th.th_teams_level = master->th.th_teams_level;
5289  thr->th.th_teams_size = master->th.th_teams_size;
5290  }
5291  }
5292 #if KMP_NESTED_HOT_TEAMS
5293  if (level) {
5294  // Sync barrier state for nested hot teams, not needed for outermost hot
5295  // team.
5296  for (f = 1; f < new_nproc; ++f) {
5297  kmp_info_t *thr = team->t.t_threads[f];
5298  int b;
5299  kmp_balign_t *balign = thr->th.th_bar;
5300  for (b = 0; b < bs_last_barrier; ++b) {
5301  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5302  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5303 #if USE_DEBUGGER
5304  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5305 #endif
5306  }
5307  }
5308  }
5309 #endif // KMP_NESTED_HOT_TEAMS
5310 
5311  /* reallocate space for arguments if necessary */
5312  __kmp_alloc_argv_entries(argc, team, TRUE);
5313  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5314  // The hot team re-uses the previous task team,
5315  // if untouched during the previous release->gather phase.
5316 
5317  KF_TRACE(10, (" hot_team = %p\n", team));
5318 
5319 #if KMP_DEBUG
5320  if (__kmp_tasking_mode != tskm_immediate_exec) {
5321  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5322  "task_team[1] = %p after reinit\n",
5323  team->t.t_task_team[0], team->t.t_task_team[1]));
5324  }
5325 #endif
5326 
5327 #if OMPT_SUPPORT
5328  __ompt_team_assign_id(team, ompt_parallel_data);
5329 #endif
5330 
5331  KMP_MB();
5332 
5333  return team;
5334  }
5335 
5336  /* next, let's try to take one from the team pool */
5337  KMP_MB();
5338  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5339  /* TODO: consider resizing undersized teams instead of reaping them, now
5340  that we have a resizing mechanism */
5341  if (team->t.t_max_nproc >= max_nproc) {
5342  /* take this team from the team pool */
5343  __kmp_team_pool = team->t.t_next_pool;
5344 
5345  /* setup the team for fresh use */
5346  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5347 
5348  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5349  "task_team[1] %p to NULL\n",
5350  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5351  team->t.t_task_team[0] = NULL;
5352  team->t.t_task_team[1] = NULL;
5353 
5354  /* reallocate space for arguments if necessary */
5355  __kmp_alloc_argv_entries(argc, team, TRUE);
5356  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5357 
5358  KA_TRACE(
5359  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5360  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5361  { // Initialize barrier data.
5362  int b;
5363  for (b = 0; b < bs_last_barrier; ++b) {
5364  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5365 #if USE_DEBUGGER
5366  team->t.t_bar[b].b_master_arrived = 0;
5367  team->t.t_bar[b].b_team_arrived = 0;
5368 #endif
5369  }
5370  }
5371 
5372  team->t.t_proc_bind = new_proc_bind;
5373 
5374  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5375  team->t.t_id));
5376 
5377 #if OMPT_SUPPORT
5378  __ompt_team_assign_id(team, ompt_parallel_data);
5379 #endif
5380 
5381  KMP_MB();
5382 
5383  return team;
5384  }
5385 
5386  /* reap team if it is too small, then loop back and check the next one */
5387  // not sure if this is wise, but this will be redone during the hot-teams
5388  // rewrite.
5389  /* TODO: Use technique to find the right size hot-team, don't reap them */
5390  team = __kmp_reap_team(team);
5391  __kmp_team_pool = team;
5392  }
5393 
5394  /* nothing available in the pool, no matter, make a new team! */
5395  KMP_MB();
5396  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5397 
5398  /* and set it up */
5399  team->t.t_max_nproc = max_nproc;
5400  /* NOTE well, for some reason allocating one big buffer and dividing it up
5401  seems to really hurt performance a lot on the P4, so let's not use this */
5402  __kmp_allocate_team_arrays(team, max_nproc);
5403 
5404  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5405  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5406 
5407  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5408  "%p to NULL\n",
5409  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5410  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5411  // memory, no need to duplicate
5412  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5413  // memory, no need to duplicate
5414 
5415  if (__kmp_storage_map) {
5416  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5417  }
5418 
5419  /* allocate space for arguments */
5420  __kmp_alloc_argv_entries(argc, team, FALSE);
5421  team->t.t_argc = argc;
5422 
5423  KA_TRACE(20,
5424  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5425  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5426  { // Initialize barrier data.
5427  int b;
5428  for (b = 0; b < bs_last_barrier; ++b) {
5429  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5430 #if USE_DEBUGGER
5431  team->t.t_bar[b].b_master_arrived = 0;
5432  team->t.t_bar[b].b_team_arrived = 0;
5433 #endif
5434  }
5435  }
5436 
5437  team->t.t_proc_bind = new_proc_bind;
5438 
5439 #if OMPT_SUPPORT
5440  __ompt_team_assign_id(team, ompt_parallel_data);
5441  team->t.ompt_serialized_team_info = NULL;
5442 #endif
5443 
5444  KMP_MB();
5445 
5446  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5447  team->t.t_id));
5448 
5449  return team;
5450 }
5451 
5452 /* TODO implement hot-teams at all levels */
5453 /* TODO implement lazy thread release on demand (disband request) */
5454 
5455 /* free the team. return it to the team pool. release all the threads
5456  * associated with it */
5457 void __kmp_free_team(kmp_root_t *root,
5458  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5459  int f;
5460  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5461  team->t.t_id));
5462 
5463  /* verify state */
5464  KMP_DEBUG_ASSERT(root);
5465  KMP_DEBUG_ASSERT(team);
5466  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5467  KMP_DEBUG_ASSERT(team->t.t_threads);
5468 
5469  int use_hot_team = team == root->r.r_hot_team;
5470 #if KMP_NESTED_HOT_TEAMS
5471  int level;
5472  kmp_hot_team_ptr_t *hot_teams;
5473  if (master) {
5474  level = team->t.t_active_level - 1;
5475  if (master->th.th_teams_microtask) { // in teams construct?
5476  if (master->th.th_teams_size.nteams > 1) {
5477  ++level; // level was not increased in teams construct for
5478  // team_of_masters
5479  }
5480  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5481  master->th.th_teams_level == team->t.t_level) {
5482  ++level; // level was not increased in teams construct for
5483  // team_of_workers before the parallel
5484  } // team->t.t_level will be increased inside parallel
5485  }
5486  hot_teams = master->th.th_hot_teams;
5487  if (level < __kmp_hot_teams_max_level) {
5488  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5489  use_hot_team = 1;
5490  }
5491  }
5492 #endif // KMP_NESTED_HOT_TEAMS
5493 
5494  /* team is done working */
5495  TCW_SYNC_PTR(team->t.t_pkfn,
5496  NULL); // Important for Debugging Support Library.
5497 #if KMP_OS_WINDOWS
5498  team->t.t_copyin_counter = 0; // init counter for possible reuse
5499 #endif
5500  // Do not reset pointer to parent team to NULL for hot teams.
5501 
5502  /* if we are non-hot team, release our threads */
5503  if (!use_hot_team) {
5504  if (__kmp_tasking_mode != tskm_immediate_exec) {
5505  // Wait for threads to reach reapable state
5506  for (f = 1; f < team->t.t_nproc; ++f) {
5507  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5508  kmp_info_t *th = team->t.t_threads[f];
5509  volatile kmp_uint32 *state = &th->th.th_reap_state;
5510  while (*state != KMP_SAFE_TO_REAP) {
5511 #if KMP_OS_WINDOWS
5512  // On Windows a thread can be killed at any time, check this
5513  DWORD ecode;
5514  if (!__kmp_is_thread_alive(th, &ecode)) {
5515  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5516  break;
5517  }
5518 #endif
5519  // first check if thread is sleeping
5520  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5521  if (fl.is_sleeping())
5522  fl.resume(__kmp_gtid_from_thread(th));
5523  KMP_CPU_PAUSE();
5524  }
5525  }
5526 
5527  // Delete task teams
5528  int tt_idx;
5529  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5530  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5531  if (task_team != NULL) {
5532  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5533  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5534  team->t.t_threads[f]->th.th_task_team = NULL;
5535  }
5536  KA_TRACE(
5537  20,
5538  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5539  __kmp_get_gtid(), task_team, team->t.t_id));
5540 #if KMP_NESTED_HOT_TEAMS
5541  __kmp_free_task_team(master, task_team);
5542 #endif
5543  team->t.t_task_team[tt_idx] = NULL;
5544  }
5545  }
5546  }
5547 
5548  // Reset pointer to parent team only for non-hot teams.
5549  team->t.t_parent = NULL;
5550  team->t.t_level = 0;
5551  team->t.t_active_level = 0;
5552 
5553  /* free the worker threads */
5554  for (f = 1; f < team->t.t_nproc; ++f) {
5555  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5556  __kmp_free_thread(team->t.t_threads[f]);
5557  team->t.t_threads[f] = NULL;
5558  }
5559 
5560  /* put the team back in the team pool */
5561  /* TODO limit size of team pool, call reap_team if pool too large */
5562  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5563  __kmp_team_pool = (volatile kmp_team_t *)team;
5564  } else { // Check if team was created for primary threads in teams construct
5565  // See if first worker is a CG root
5566  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5567  team->t.t_threads[1]->th.th_cg_roots);
5568  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5569  // Clean up the CG root nodes on workers so that this team can be re-used
5570  for (f = 1; f < team->t.t_nproc; ++f) {
5571  kmp_info_t *thr = team->t.t_threads[f];
5572  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5573  thr->th.th_cg_roots->cg_root == thr);
5574  // Pop current CG root off list
5575  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5576  thr->th.th_cg_roots = tmp->up;
5577  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5578  " up to node %p. cg_nthreads was %d\n",
5579  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5580  int i = tmp->cg_nthreads--;
5581  if (i == 1) {
5582  __kmp_free(tmp); // free CG if we are the last thread in it
5583  }
5584  // Restore current task's thread_limit from CG root
5585  if (thr->th.th_cg_roots)
5586  thr->th.th_current_task->td_icvs.thread_limit =
5587  thr->th.th_cg_roots->cg_thread_limit;
5588  }
5589  }
5590  }
5591 
5592  KMP_MB();
5593 }
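/* Recap of __kmp_free_team: a non-hot team waits for its workers to become
   reapable, detaches the task teams, returns the workers to the thread pool
   and links the team onto __kmp_team_pool; a hot team keeps its threads, but
   when its workers are CG roots (teams construct) their CG-root nodes are
   popped and the thread_limit ICV is restored from the parent node. */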
5594 
5595 /* reap the team. destroy it, reclaim all its resources and free its memory */
5596 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5597  kmp_team_t *next_pool = team->t.t_next_pool;
5598 
5599  KMP_DEBUG_ASSERT(team);
5600  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5601  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5602  KMP_DEBUG_ASSERT(team->t.t_threads);
5603  KMP_DEBUG_ASSERT(team->t.t_argv);
5604 
5605  /* TODO clean the threads that are a part of this? */
5606 
5607  /* free stuff */
5608  __kmp_free_team_arrays(team);
5609  if (team->t.t_argv != &team->t.t_inline_argv[0])
5610  __kmp_free((void *)team->t.t_argv);
5611  __kmp_free(team);
5612 
5613  KMP_MB();
5614  return next_pool;
5615 }
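/* Pool lifecycle (sketch): __kmp_free_team pushes non-hot teams onto
   __kmp_team_pool, __kmp_allocate_team pops one back off when its t_max_nproc
   is large enough, and __kmp_reap_team destroys a pooled team that is too
   small for the current request or left over at shutdown
   (__kmp_internal_end). */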
5616 
5617 // Free the thread. Don't reap it, just place it on the pool of available
5618 // threads.
5619 //
5620 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5621 // binding for the affinity mechanism to be useful.
5622 //
5623 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5624 // However, we want to avoid a potential performance problem by always
5625 // scanning through the list to find the correct point at which to insert
5626 // the thread (potential N**2 behavior). To do this we keep track of the
5627 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5628 // With single-level parallelism, threads will always be added to the tail
5629 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5630 // parallelism, all bets are off and we may need to scan through the entire
5631 // free list.
5632 //
5633 // This change also has a potentially large performance benefit, for some
5634 // applications. Previously, as threads were freed from the hot team, they
5635 // would be placed back on the free list in inverse order. If the hot team
5636  grew back to its original size, then the freed threads would be placed
5637 // back on the hot team in reverse order. This could cause bad cache
5638 // locality problems on programs where the size of the hot team regularly
5639 // grew and shrunk.
5640 //
5641 // Now, for single-level parallelism, the OMP tid is always == gtid.
5642 void __kmp_free_thread(kmp_info_t *this_th) {
5643  int gtid;
5644  kmp_info_t **scan;
5645 
5646  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5647  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5648 
5649  KMP_DEBUG_ASSERT(this_th);
5650 
5651  // When moving a thread to the pool, switch it to wait on its own b_go
5652  // flag and detach it from its team (NULL team).
5653  int b;
5654  kmp_balign_t *balign = this_th->th.th_bar;
5655  for (b = 0; b < bs_last_barrier; ++b) {
5656  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5657  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5658  balign[b].bb.team = NULL;
5659  balign[b].bb.leaf_kids = 0;
5660  }
5661  this_th->th.th_task_state = 0;
5662  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5663 
5664  /* put thread back on the free pool */
5665  TCW_PTR(this_th->th.th_team, NULL);
5666  TCW_PTR(this_th->th.th_root, NULL);
5667  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5668 
5669  while (this_th->th.th_cg_roots) {
5670  this_th->th.th_cg_roots->cg_nthreads--;
5671  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5672  " %p of thread %p to %d\n",
5673  this_th, this_th->th.th_cg_roots,
5674  this_th->th.th_cg_roots->cg_root,
5675  this_th->th.th_cg_roots->cg_nthreads));
5676  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5677  if (tmp->cg_root == this_th) { // Thread is a cg_root
5678  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5679  KA_TRACE(
5680  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5681  this_th->th.th_cg_roots = tmp->up;
5682  __kmp_free(tmp);
5683  } else { // Worker thread
5684  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5685  __kmp_free(tmp);
5686  }
5687  this_th->th.th_cg_roots = NULL;
5688  break;
5689  }
5690  }
5691 
5692  /* If the implicit task assigned to this thread can be used by other
5693  * threads, multiple threads may share the data and try to free the task in
5694  * __kmp_reap_thread at exit. This duplicate use of the task data happens
5695  * with higher probability when the hot team is disabled, but it can occur
5696  * even when the hot team is enabled */
5697  __kmp_free_implicit_task(this_th);
5698  this_th->th.th_current_task = NULL;
5699 
5700  // If the __kmp_thread_pool_insert_pt is already past the new insert
5701  // point, then we need to re-scan the entire list.
5702  gtid = this_th->th.th_info.ds.ds_gtid;
5703  if (__kmp_thread_pool_insert_pt != NULL) {
5704  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5705  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5706  __kmp_thread_pool_insert_pt = NULL;
5707  }
5708  }
5709 
5710  // Scan down the list to find the place to insert the thread.
5711  // scan is the address of a link in the list, possibly the address of
5712  // __kmp_thread_pool itself.
5713  //
5714  // In the absence of nested parallelism, the for loop will have 0 iterations.
5715  if (__kmp_thread_pool_insert_pt != NULL) {
5716  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5717  } else {
5718  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5719  }
5720  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5721  scan = &((*scan)->th.th_next_pool))
5722  ;
5723 
5724  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5725  // to its address.
5726  TCW_PTR(this_th->th.th_next_pool, *scan);
5727  __kmp_thread_pool_insert_pt = *scan = this_th;
5728  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5729  (this_th->th.th_info.ds.ds_gtid <
5730  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5731  TCW_4(this_th->th.th_in_pool, TRUE);
5732  __kmp_suspend_initialize_thread(this_th);
5733  __kmp_lock_suspend_mx(this_th);
5734  if (this_th->th.th_active == TRUE) {
5735  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5736  this_th->th.th_active_in_pool = TRUE;
5737  }
5738 #if KMP_DEBUG
5739  else {
5740  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5741  }
5742 #endif
5743  __kmp_unlock_suspend_mx(this_th);
5744 
5745  TCW_4(__kmp_nth, __kmp_nth - 1);
5746 
5747 #ifdef KMP_ADJUST_BLOCKTIME
5748  /* Adjust blocktime back to user setting or default if necessary */
5749  /* Middle initialization might never have occurred */
5750  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5751  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5752  if (__kmp_nth <= __kmp_avail_proc) {
5753  __kmp_zero_bt = FALSE;
5754  }
5755  }
5756 #endif /* KMP_ADJUST_BLOCKTIME */
5757 
5758  KMP_MB();
5759 }
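/* Worked example of the sorted insertion above (illustrative gtids): if the
   pool holds T#2 -> T#3 -> T#5 and T#4 is freed while
   __kmp_thread_pool_insert_pt points at T#5, the insert point is discarded
   (5 > 4), the scan walks past T#2 and T#3, and T#4 is linked in front of
   T#5; __kmp_thread_pool_insert_pt then points at T#4, so a later free of
   T#6 starts scanning from there. */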
5760 
5761 /* ------------------------------------------------------------------------ */
5762 
5763 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5764 #if OMP_PROFILING_SUPPORT
5765  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5766  // TODO: add a configuration option for time granularity
5767  if (ProfileTraceFile)
5768  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5769 #endif
5770 
5771  int gtid = this_thr->th.th_info.ds.ds_gtid;
5772  /* void *stack_data;*/
5773  kmp_team_t **volatile pteam;
5774 
5775  KMP_MB();
5776  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5777 
5778  if (__kmp_env_consistency_check) {
5779  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5780  }
5781 
5782 #if OMPD_SUPPORT
5783  if (ompd_state & OMPD_ENABLE_BP)
5784  ompd_bp_thread_begin();
5785 #endif
5786 
5787 #if OMPT_SUPPORT
5788  ompt_data_t *thread_data = nullptr;
5789  if (ompt_enabled.enabled) {
5790  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5791  *thread_data = ompt_data_none;
5792 
5793  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5794  this_thr->th.ompt_thread_info.wait_id = 0;
5795  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5796  this_thr->th.ompt_thread_info.parallel_flags = 0;
5797  if (ompt_enabled.ompt_callback_thread_begin) {
5798  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5799  ompt_thread_worker, thread_data);
5800  }
5801  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5802  }
5803 #endif
5804 
5805  /* This is the place where threads wait for work */
5806  while (!TCR_4(__kmp_global.g.g_done)) {
5807  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5808  KMP_MB();
5809 
5810  /* wait for work to do */
5811  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5812 
5813  /* No tid yet since not part of a team */
5814  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5815 
5816 #if OMPT_SUPPORT
5817  if (ompt_enabled.enabled) {
5818  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5819  }
5820 #endif
5821 
5822  pteam = &this_thr->th.th_team;
5823 
5824  /* have we been allocated? */
5825  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5826  /* we were just woken up, so run our new task */
5827  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5828  int rc;
5829  KA_TRACE(20,
5830  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5831  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5832  (*pteam)->t.t_pkfn));
5833 
5834  updateHWFPControl(*pteam);
5835 
5836 #if OMPT_SUPPORT
5837  if (ompt_enabled.enabled) {
5838  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5839  }
5840 #endif
5841 
5842  rc = (*pteam)->t.t_invoke(gtid);
5843  KMP_ASSERT(rc);
5844 
5845  KMP_MB();
5846  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5847  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5848  (*pteam)->t.t_pkfn));
5849  }
5850 #if OMPT_SUPPORT
5851  if (ompt_enabled.enabled) {
5852  /* no frame set while outside task */
5853  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5854 
5855  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5856  }
5857 #endif
5858  /* join barrier after parallel region */
5859  __kmp_join_barrier(gtid);
5860  }
5861  }
5862  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5863 
5864 #if OMPD_SUPPORT
5865  if (ompd_state & OMPD_ENABLE_BP)
5866  ompd_bp_thread_end();
5867 #endif
5868 
5869 #if OMPT_SUPPORT
5870  if (ompt_enabled.ompt_callback_thread_end) {
5871  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5872  }
5873 #endif
5874 
5875  this_thr->th.th_task_team = NULL;
5876  /* run the destructors for the threadprivate data for this thread */
5877  __kmp_common_destroy_gtid(gtid);
5878 
5879  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5880  KMP_MB();
5881 
5882 #if OMP_PROFILING_SUPPORT
5883  llvm::timeTraceProfilerFinishThread();
5884 #endif
5885  return this_thr;
5886 }
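/* Worker lifecycle (sketch): each worker parks in __kmp_fork_barrier until
   the primary thread hands it a team with t_pkfn set, runs the microtask via
   t_invoke(gtid), synchronizes in __kmp_join_barrier, and loops back to the
   fork barrier until __kmp_global.g.g_done is set at shutdown. */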
5887 
5888 /* ------------------------------------------------------------------------ */
5889 
5890 void __kmp_internal_end_dest(void *specific_gtid) {
5891  // Make sure no significant bits are lost
5892  int gtid;
5893  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5894 
5895  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5896  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5897  * because 0 is reserved for the nothing-stored case */
5898 
5899  __kmp_internal_end_thread(gtid);
5900 }
5901 
5902 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5903 
5904 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5905  __kmp_internal_end_atexit();
5906 }
5907 
5908 #endif
5909 
5910 /* [Windows] josh: when the atexit handler is called, there may still be more
5911  than one thread alive */
5912 void __kmp_internal_end_atexit(void) {
5913  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5914  /* [Windows]
5915  josh: ideally, we want to completely shutdown the library in this atexit
5916  handler, but stat code that depends on thread specific data for gtid fails
5917  because that data becomes unavailable at some point during the shutdown, so
5918  we call __kmp_internal_end_thread instead. We should eventually remove the
5919  dependency on __kmp_get_specific_gtid in the stat code and use
5920  __kmp_internal_end_library to cleanly shutdown the library.
5921 
5922  // TODO: Can some of this comment about GVS be removed?
5923  I suspect that the offending stat code is executed when the calling thread
5924  tries to clean up a dead root thread's data structures, resulting in GVS
5925  code trying to close the GVS structures for that thread, but since the stat
5926  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5927  the calling thread is cleaning up itself instead of another thread, it get
5928  the calling thread is cleaning up itself instead of another thread, it gets
5929  confused. This happens because allowing a thread to unregister and clean up
5930  Based on the current design (20050722), a thread may end up
5931  trying to unregister another thread only if thread death does not trigger
5932  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5933  thread specific data destructor function to detect thread death. For
5934  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5935  is nothing. Thus, the workaround is applicable only for Windows static
5936  stat library. */
5937  __kmp_internal_end_library(-1);
5938 #if KMP_OS_WINDOWS
5939  __kmp_close_console();
5940 #endif
5941 }
5942 
5943 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5944  // It is assumed __kmp_forkjoin_lock is acquired.
5945 
5946  int gtid;
5947 
5948  KMP_DEBUG_ASSERT(thread != NULL);
5949 
5950  gtid = thread->th.th_info.ds.ds_gtid;
5951 
5952  if (!is_root) {
5953  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5954  /* Assume the threads are at the fork barrier here */
5955  KA_TRACE(
5956  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5957  gtid));
5958  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5959  * (GEH) */
5960  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5961  thread);
5962  __kmp_release_64(&flag);
5963  }
5964 
5965  // Terminate OS thread.
5966  __kmp_reap_worker(thread);
5967 
5968  // The thread was killed asynchronously. If it was actively
5969  // spinning in the thread pool, decrement the global count.
5970  //
5971  // There is a small timing hole here - if the worker thread was just waking
5972  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5973  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5974  // the global counter might not get updated.
5975  //
5976  // Currently, this can only happen as the library is unloaded,
5977  // so there are no harmful side effects.
5978  if (thread->th.th_active_in_pool) {
5979  thread->th.th_active_in_pool = FALSE;
5980  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5981  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5982  }
5983  }
5984 
5985  __kmp_free_implicit_task(thread);
5986 
5987 // Free the fast memory for tasking
5988 #if USE_FAST_MEMORY
5989  __kmp_free_fast_memory(thread);
5990 #endif /* USE_FAST_MEMORY */
5991 
5992  __kmp_suspend_uninitialize_thread(thread);
5993 
5994  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5995  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5996 
5997  --__kmp_all_nth;
5998  // __kmp_nth was decremented when thread is added to the pool.
5999 
6000 #ifdef KMP_ADJUST_BLOCKTIME
6001  /* Adjust blocktime back to user setting or default if necessary */
6002  /* Middle initialization might never have occurred */
6003  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6004  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6005  if (__kmp_nth <= __kmp_avail_proc) {
6006  __kmp_zero_bt = FALSE;
6007  }
6008  }
6009 #endif /* KMP_ADJUST_BLOCKTIME */
6010 
6011  /* free the memory being used */
6012  if (__kmp_env_consistency_check) {
6013  if (thread->th.th_cons) {
6014  __kmp_free_cons_stack(thread->th.th_cons);
6015  thread->th.th_cons = NULL;
6016  }
6017  }
6018 
6019  if (thread->th.th_pri_common != NULL) {
6020  __kmp_free(thread->th.th_pri_common);
6021  thread->th.th_pri_common = NULL;
6022  }
6023 
6024  if (thread->th.th_task_state_memo_stack != NULL) {
6025  __kmp_free(thread->th.th_task_state_memo_stack);
6026  thread->th.th_task_state_memo_stack = NULL;
6027  }
6028 
6029 #if KMP_USE_BGET
6030  if (thread->th.th_local.bget_data != NULL) {
6031  __kmp_finalize_bget(thread);
6032  }
6033 #endif
6034 
6035 #if KMP_AFFINITY_SUPPORTED
6036  if (thread->th.th_affin_mask != NULL) {
6037  KMP_CPU_FREE(thread->th.th_affin_mask);
6038  thread->th.th_affin_mask = NULL;
6039  }
6040 #endif /* KMP_AFFINITY_SUPPORTED */
6041 
6042 #if KMP_USE_HIER_SCHED
6043  if (thread->th.th_hier_bar_data != NULL) {
6044  __kmp_free(thread->th.th_hier_bar_data);
6045  thread->th.th_hier_bar_data = NULL;
6046  }
6047 #endif
6048 
6049  __kmp_reap_team(thread->th.th_serial_team);
6050  thread->th.th_serial_team = NULL;
6051  __kmp_free(thread);
6052 
6053  KMP_MB();
6054 
6055 } // __kmp_reap_thread
6056 
6057 static void __kmp_internal_end(void) {
6058  int i;
6059 
6060  /* First, unregister the library */
6061  __kmp_unregister_library();
6062 
6063 #if KMP_OS_WINDOWS
6064  /* In Win static library, we can't tell when a root actually dies, so we
6065  reclaim the data structures for any root threads that have died but not
6066  unregistered themselves, in order to shut down cleanly.
6067  In Win dynamic library we also can't tell when a thread dies. */
6068  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6069 // dead roots
6070 #endif
6071 
6072  for (i = 0; i < __kmp_threads_capacity; i++)
6073  if (__kmp_root[i])
6074  if (__kmp_root[i]->r.r_active)
6075  break;
6076  KMP_MB(); /* Flush all pending memory write invalidates. */
6077  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6078 
6079  if (i < __kmp_threads_capacity) {
6080 #if KMP_USE_MONITOR
6081  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6082  KMP_MB(); /* Flush all pending memory write invalidates. */
6083 
6084  // Need to check that monitor was initialized before reaping it. If we are
6085  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6086  // __kmp_monitor will appear to contain valid data, but it is only valid in
6087  // the parent process, not the child.
6088  // New behavior (201008): instead of keying off of the flag
6089  // __kmp_init_parallel, the monitor thread creation is keyed off
6090  // of the new flag __kmp_init_monitor.
6091  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6092  if (TCR_4(__kmp_init_monitor)) {
6093  __kmp_reap_monitor(&__kmp_monitor);
6094  TCW_4(__kmp_init_monitor, 0);
6095  }
6096  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6097  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6098 #endif // KMP_USE_MONITOR
6099  } else {
6100 /* TODO move this to cleanup code */
6101 #ifdef KMP_DEBUG
6102  /* make sure that everything has properly ended */
6103  for (i = 0; i < __kmp_threads_capacity; i++) {
6104  if (__kmp_root[i]) {
6105  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6106  // there can be uber threads alive here
6107  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6108  }
6109  }
6110 #endif
6111 
6112  KMP_MB();
6113 
6114  // Reap the worker threads.
6115  // This is valid for now, but be careful if threads are reaped sooner.
6116  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6117  // Get the next thread from the pool.
6118  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6119  __kmp_thread_pool = thread->th.th_next_pool;
6120  // Reap it.
6121  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6122  thread->th.th_next_pool = NULL;
6123  thread->th.th_in_pool = FALSE;
6124  __kmp_reap_thread(thread, 0);
6125  }
6126  __kmp_thread_pool_insert_pt = NULL;
6127 
6128  // Reap teams.
6129  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6130  // Get the next team from the pool.
6131  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6132  __kmp_team_pool = team->t.t_next_pool;
6133  // Reap it.
6134  team->t.t_next_pool = NULL;
6135  __kmp_reap_team(team);
6136  }
6137 
6138  __kmp_reap_task_teams();
6139 
6140 #if KMP_OS_UNIX
6141  // Threads that are not reaped should not access any resources since they
6142  // are going to be deallocated soon, so the shutdown sequence should wait
6143  // until all threads either exit the final spin-waiting loop or begin
6144  // sleeping after the given blocktime.
6145  for (i = 0; i < __kmp_threads_capacity; i++) {
6146  kmp_info_t *thr = __kmp_threads[i];
6147  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6148  KMP_CPU_PAUSE();
6149  }
6150 #endif
6151 
6152  for (i = 0; i < __kmp_threads_capacity; ++i) {
6153  // TBD: Add some checking...
6154  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6155  }
6156 
6157  /* Make sure all threadprivate destructors get run by joining with all
6158  worker threads before resetting this flag */
6159  TCW_SYNC_4(__kmp_init_common, FALSE);
6160 
6161  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6162  KMP_MB();
6163 
6164 #if KMP_USE_MONITOR
6165  // See note above: One of the possible fixes for CQ138434 / CQ140126
6166  //
6167  // FIXME: push both code fragments down and CSE them?
6168  // push them into __kmp_cleanup() ?
6169  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6170  if (TCR_4(__kmp_init_monitor)) {
6171  __kmp_reap_monitor(&__kmp_monitor);
6172  TCW_4(__kmp_init_monitor, 0);
6173  }
6174  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6175  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6176 #endif
6177  } /* else !__kmp_global.t_active */
6178  TCW_4(__kmp_init_gtid, FALSE);
6179  KMP_MB(); /* Flush all pending memory write invalidates. */
6180 
6181  __kmp_cleanup();
6182 #if OMPT_SUPPORT
6183  ompt_fini();
6184 #endif
6185 }
6186 
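// gtid_req is the caller's gtid if it is already known; a negative value means
// "unknown", in which case the gtid is looked up below via
// __kmp_gtid_get_specific().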
6187 void __kmp_internal_end_library(int gtid_req) {
6188  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6189  /* this shouldn't be a race condition because __kmp_internal_end() is the
6190  only place to clear __kmp_serial_init */
6191  /* we'll check this later too, after we get the lock */
6192  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6193  // redundant, because the next check will work in any case.
6194  if (__kmp_global.g.g_abort) {
6195  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6196  /* TODO abort? */
6197  return;
6198  }
6199  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6200  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6201  return;
6202  }
6203 
6204  // If hidden helper team has been initialized, we need to deinit it
6205  if (TCR_4(__kmp_init_hidden_helper) &&
6206  !TCR_4(__kmp_hidden_helper_team_done)) {
6207  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6208  // First release the main thread to let it continue its work
6209  __kmp_hidden_helper_main_thread_release();
6210  // Wait until the hidden helper team has been destroyed
6211  __kmp_hidden_helper_threads_deinitz_wait();
6212  }
6213 
6214  KMP_MB(); /* Flush all pending memory write invalidates. */
6215  /* find out who we are and what we should do */
6216  {
6217  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6218  KA_TRACE(
6219  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6220  if (gtid == KMP_GTID_SHUTDOWN) {
6221  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6222  "already shutdown\n"));
6223  return;
6224  } else if (gtid == KMP_GTID_MONITOR) {
6225  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6226  "registered, or system shutdown\n"));
6227  return;
6228  } else if (gtid == KMP_GTID_DNE) {
6229  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6230  "shutdown\n"));
6231  /* we don't know who we are, but we may still shut down the library */
6232  } else if (KMP_UBER_GTID(gtid)) {
6233  /* unregister ourselves as an uber thread. gtid is no longer valid */
6234  if (__kmp_root[gtid]->r.r_active) {
6235  __kmp_global.g.g_abort = -1;
6236  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6237  __kmp_unregister_library();
6238  KA_TRACE(10,
6239  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6240  gtid));
6241  return;
6242  } else {
6243  KA_TRACE(
6244  10,
6245  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6246  __kmp_unregister_root_current_thread(gtid);
6247  }
6248  } else {
6249 /* worker threads may call this function through the atexit handler, if they
6250  * call exit() */
6251 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6252  TODO: do a thorough shutdown instead */
6253 #ifdef DUMP_DEBUG_ON_EXIT
6254  if (__kmp_debug_buf)
6255  __kmp_dump_debug_buffer();
6256 #endif
6257  // The unregister-library call was added here for the Linux shared-memory
6258  // scheme; without it, stale registration files would be left in /dev/shm.
6259  // Clean up the shared memory file before exiting.
6260  __kmp_unregister_library();
6261  return;
6262  }
6263  }
6264  /* synchronize the termination process */
6265  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6266 
6267  /* have we already finished */
6268  if (__kmp_global.g.g_abort) {
6269  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6270  /* TODO abort? */
6271  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6272  return;
6273  }
6274  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6275  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6276  return;
6277  }
6278 
6279  /* We need this lock to enforce mutex between this reading of
6280  __kmp_threads_capacity and the writing by __kmp_register_root.
6281  Alternatively, we can use a counter of roots that is atomically updated by
6282  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6283  __kmp_internal_end_*. */
6284  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6285 
6286  /* now we can safely conduct the actual termination */
6287  __kmp_internal_end();
6288 
6289  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6290  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6291 
6292  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6293 
6294 #ifdef DUMP_DEBUG_ON_EXIT
6295  if (__kmp_debug_buf)
6296  __kmp_dump_debug_buffer();
6297 #endif
6298 
6299 #if KMP_OS_WINDOWS
6300  __kmp_close_console();
6301 #endif
6302 
6303  __kmp_fini_allocator();
6304 
6305 } // __kmp_internal_end_library
6306 
6307 void __kmp_internal_end_thread(int gtid_req) {
6308  int i;
6309 
6310  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6311  /* this shouldn't be a race condition because __kmp_internal_end() is the
6312  * only place to clear __kmp_serial_init */
6313  /* we'll check this later too, after we get the lock */
6314  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6315  // redundant, because the next check will work in any case.
6316  if (__kmp_global.g.g_abort) {
6317  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6318  /* TODO abort? */
6319  return;
6320  }
6321  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6322  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6323  return;
6324  }
6325 
6326  // If hidden helper team has been initialized, we need to deinit it
6327  if (TCR_4(__kmp_init_hidden_helper) &&
6328  !TCR_4(__kmp_hidden_helper_team_done)) {
6329  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6330  // First release the main thread to let it continue its work
6331  __kmp_hidden_helper_main_thread_release();
6332  // Wait until the hidden helper team has been destroyed
6333  __kmp_hidden_helper_threads_deinitz_wait();
6334  }
6335 
6336  KMP_MB(); /* Flush all pending memory write invalidates. */
6337 
6338  /* find out who we are and what we should do */
6339  {
6340  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6341  KA_TRACE(10,
6342  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6343  if (gtid == KMP_GTID_SHUTDOWN) {
6344  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6345  "already shutdown\n"));
6346  return;
6347  } else if (gtid == KMP_GTID_MONITOR) {
6348  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6349  "registered, or system shutdown\n"));
6350  return;
6351  } else if (gtid == KMP_GTID_DNE) {
6352  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6353  "shutdown\n"));
6354  return;
6355  /* we don't know who we are */
6356  } else if (KMP_UBER_GTID(gtid)) {
6357  /* unregister ourselves as an uber thread. gtid is no longer valid */
6358  if (__kmp_root[gtid]->r.r_active) {
6359  __kmp_global.g.g_abort = -1;
6360  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6361  KA_TRACE(10,
6362  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6363  gtid));
6364  return;
6365  } else {
6366  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6367  gtid));
6368  __kmp_unregister_root_current_thread(gtid);
6369  }
6370  } else {
6371  /* just a worker thread, let's leave */
6372  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6373 
6374  if (gtid >= 0) {
6375  __kmp_threads[gtid]->th.th_task_team = NULL;
6376  }
6377 
6378  KA_TRACE(10,
6379  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6380  gtid));
6381  return;
6382  }
6383  }
6384 #if KMP_DYNAMIC_LIB
6385  if (__kmp_pause_status != kmp_hard_paused)
6386  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6387  // because it is better to shut down later in the library destructor.
6388  {
6389  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6390  return;
6391  }
6392 #endif
6393  /* synchronize the termination process */
6394  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6395 
6396  /* have we already finished */
6397  if (__kmp_global.g.g_abort) {
6398  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6399  /* TODO abort? */
6400  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6401  return;
6402  }
6403  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6404  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6405  return;
6406  }
6407 
6408  /* We need this lock to enforce mutex between this reading of
6409  __kmp_threads_capacity and the writing by __kmp_register_root.
6410  Alternatively, we can use a counter of roots that is atomically updated by
6411  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6412  __kmp_internal_end_*. */
6413 
6414  /* should we finish the run-time? are all siblings done? */
6415  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6416 
6417  for (i = 0; i < __kmp_threads_capacity; ++i) {
6418  if (KMP_UBER_GTID(i)) {
6419  KA_TRACE(
6420  10,
6421  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6422  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6423  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6424  return;
6425  }
6426  }
6427 
6428  /* now we can safely conduct the actual termination */
6429 
6430  __kmp_internal_end();
6431 
6432  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6433  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6434 
6435  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6436 
6437 #ifdef DUMP_DEBUG_ON_EXIT
6438  if (__kmp_debug_buf)
6439  __kmp_dump_debug_buffer();
6440 #endif
6441 } // __kmp_internal_end_thread
6442 
6443 // -----------------------------------------------------------------------------
6444 // Library registration stuff.
6445 
6446 // Random value used to indicate library initialization.
6447 static long __kmp_registration_flag = 0;
6448 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6449 static char *__kmp_registration_str = NULL;
6450 
6451 static inline char *__kmp_reg_status_name() {
6452 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6453  each thread. If registration and unregistration happen in different threads
6454  (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6455  env var cannot be found, because its name will contain a different pid. */
6456 // macOS* complains about the name being too long when getuid() is appended
6457 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6458  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6459  (int)getuid());
6460 #else
6461  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6462 #endif
6463 } // __kmp_reg_status_name
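// For illustration only (hypothetical pid/uid): with pid 12345 and uid 1000,
// the Unix dynamic-library branch above yields "__KMP_REGISTERED_LIB_12345_1000",
// while the other branch yields "__KMP_REGISTERED_LIB_12345".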
6464 
6465 void __kmp_register_library_startup(void) {
6466 
6467  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6468  int done = 0;
6469  union {
6470  double dtime;
6471  long ltime;
6472  } time;
6473 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6474  __kmp_initialize_system_tick();
6475 #endif
6476  __kmp_read_system_time(&time.dtime);
6477  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6478  __kmp_registration_str =
6479  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6480  __kmp_registration_flag, KMP_LIBRARY_FILE);
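  // The registration value thus has the form
  // "<flag address>-<flag value in hex>-<library file>", e.g. (illustrative
  // values only) "0x7ffe9c10-cafe1234-libomp.so"; it is split back into these
  // three fields below when a collision with another runtime is detected.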
6481 
6482  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6483  __kmp_registration_str));
6484 
6485  while (!done) {
6486 
6487  char *value = NULL; // Actual value of the environment variable.
6488 
6489 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
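  // Shared-memory registration path: try to create the segment (a file under
  // /dev/shm on Linux) with O_CREAT | O_EXCL so that exactly one runtime wins
  // the race; if it already exists, open it and read the registration string
  // stored by whichever runtime created it, then compare it with ours below.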
6490  char *shm_name = __kmp_str_format("/%s", name);
6491  int shm_preexist = 0;
6492  char *data1;
6493  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6494  if ((fd1 == -1) && (errno == EEXIST)) {
6495  // The file didn't open because it already exists.
6496  // Try opening the existing file.
6497  fd1 = shm_open(shm_name, O_RDWR, 0666);
6498  if (fd1 == -1) { // file didn't open
6499  // error out here
6500  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6501  __kmp_msg_null);
6502  } else {
6503  // able to open existing file
6504  shm_preexist = 1;
6505  }
6506  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6507  // "already exists".
6508  // Error out here.
6509  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6510  __kmp_msg_null);
6511  }
6512  if (shm_preexist == 0) {
6513  // we created SHM now set size
6514  if (ftruncate(fd1, SHM_SIZE) == -1) {
6515  // error occurred while setting the size;
6516  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6517  KMP_ERR(errno), __kmp_msg_null);
6518  }
6519  }
6520  data1 =
6521  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6522  if (data1 == MAP_FAILED) {
6523  // failed to map shared memory
6524  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6525  __kmp_msg_null);
6526  }
6527  if (shm_preexist == 0) { // set data to SHM, set value
6528  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6529  }
6530  // Read the value from either what we just wrote or the existing file.
6531  value = __kmp_str_format("%s", data1); // read value from SHM
6532  munmap(data1, SHM_SIZE);
6533  close(fd1);
6534 #else // Windows and unix with static library
6535 #else // Windows and Unix with static library
6536  // Set the environment variable, but do not overwrite it if it already exists.
6537  // read value to see if it got set
6538  value = __kmp_env_get(name);
6539 #endif
6540 
6541  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6542  done = 1; // Ok, environment variable set successfully, exit the loop.
6543  } else {
6544  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6545  // Check whether it is alive or dead.
6546  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6547  char *tail = value;
6548  char *flag_addr_str = NULL;
6549  char *flag_val_str = NULL;
6550  char const *file_name = NULL;
6551  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6552  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6553  file_name = tail;
6554  if (tail != NULL) {
6555  unsigned long *flag_addr = 0;
6556  unsigned long flag_val = 0;
6557  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6558  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6559  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6560  // First, check whether environment-encoded address is mapped into
6561  // addr space.
6562  // If so, dereference it to see if it still has the right value.
6563  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6564  neighbor = 1;
6565  } else {
6566  // If not, then we know the other copy of the library is no longer
6567  // running.
6568  neighbor = 2;
6569  }
6570  }
6571  }
6572  switch (neighbor) {
6573  case 0: // Cannot parse environment variable -- neighbor status unknown.
6574  // Assume it is an incompatible format from a future version of the
6575  // library, and assume the other library is alive.
6576  // WARN( ... ); // TODO: Issue a warning.
6577  file_name = "unknown library";
6578  KMP_FALLTHROUGH();
6579  // Attention! Falling through to the next case. That's intentional.
6580  case 1: { // Neighbor is alive.
6581  // Check it is allowed.
6582  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6583  if (!__kmp_str_match_true(duplicate_ok)) {
6584  // That's not allowed. Issue fatal error.
6585  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6586  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6587  }
6588  KMP_INTERNAL_FREE(duplicate_ok);
6589  __kmp_duplicate_library_ok = 1;
6590  done = 1; // Exit the loop.
6591  } break;
6592  case 2: { // Neighbor is dead.
6593 
6594 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6595  // close shared memory.
6596  shm_unlink(shm_name); // this removes file in /dev/shm
6597 #else
6598  // Clear the variable and try to register library again.
6599  __kmp_env_unset(name);
6600 #endif
6601  } break;
6602  default: {
6603  KMP_DEBUG_ASSERT(0);
6604  } break;
6605  }
6606  }
6607  KMP_INTERNAL_FREE((void *)value);
6608 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6609  KMP_INTERNAL_FREE((void *)shm_name);
6610 #endif
6611  } // while
6612  KMP_INTERNAL_FREE((void *)name);
6613 
6614 } // func __kmp_register_library_startup
6615 
6616 void __kmp_unregister_library(void) {
6617 
6618  char *name = __kmp_reg_status_name();
6619  char *value = NULL;
6620 
6621 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6622  char *shm_name = __kmp_str_format("/%s", name);
6623  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6624  if (fd1 == -1) {
6625  // file did not open. return.
6626  return;
6627  }
6628  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6629  if (data1 != MAP_FAILED) {
6630  value = __kmp_str_format("%s", data1); // read value from SHM
6631  munmap(data1, SHM_SIZE);
6632  }
6633  close(fd1);
6634 #else
6635  value = __kmp_env_get(name);
6636 #endif
6637 
6638  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6639  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6640  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6641 // Ok, this is our variable. Delete it.
6642 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6643  shm_unlink(shm_name); // this removes file in /dev/shm
6644 #else
6645  __kmp_env_unset(name);
6646 #endif
6647  }
6648 
6649 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6650  KMP_INTERNAL_FREE(shm_name);
6651 #endif
6652 
6653  KMP_INTERNAL_FREE(__kmp_registration_str);
6654  KMP_INTERNAL_FREE(value);
6655  KMP_INTERNAL_FREE(name);
6656 
6657  __kmp_registration_flag = 0;
6658  __kmp_registration_str = NULL;
6659 
6660 } // __kmp_unregister_library
6661 
6662 // End of Library registration stuff.
6663 // -----------------------------------------------------------------------------
6664 
6665 #if KMP_MIC_SUPPORTED
6666 
6667 static void __kmp_check_mic_type() {
6668  kmp_cpuid_t cpuid_state = {0};
6669  kmp_cpuid_t *cs_p = &cpuid_state;
6670  __kmp_x86_cpuid(1, 0, cs_p);
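  // The masks below decode the family/model fields of CPUID leaf 1 EAX:
  // 0xff0 covers model and family (0xB10 selects mic2, i.e. KNC per the
  // comments later in this file), and 0xf0ff0 adds the extended-model field
  // (0x50670 selects mic3).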
6671  // We don't support mic1 at the moment
6672  if ((cs_p->eax & 0xff0) == 0xB10) {
6673  __kmp_mic_type = mic2;
6674  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6675  __kmp_mic_type = mic3;
6676  } else {
6677  __kmp_mic_type = non_mic;
6678  }
6679 }
6680 
6681 #endif /* KMP_MIC_SUPPORTED */
6682 
6683 #if KMP_HAVE_UMWAIT
6684 static void __kmp_user_level_mwait_init() {
6685  struct kmp_cpuid buf;
6686  __kmp_x86_cpuid(7, 0, &buf);
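  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature flag
  // (umonitor/umwait/tpause), which is tested on the next line.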
6687  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6688  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6689  __kmp_umwait_enabled));
6690 }
6691 #elif KMP_HAVE_MWAIT
6692 #ifndef AT_INTELPHIUSERMWAIT
6693 // Spurious, non-existent value that should always fail to return anything.
6694 // Will be replaced with the correct value once it is known.
6695 #define AT_INTELPHIUSERMWAIT 10000
6696 #endif
6697 // The getauxval() function is available in RHEL7 and SLES12. If a system with
6698 // an earlier OS is used to build the RTL, we'll use the following internal
6699 // function when the entry is not found.
6700 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6701 unsigned long getauxval(unsigned long) { return 0; }
6702 
6703 static void __kmp_user_level_mwait_init() {
6704  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6705  // available, use them to find out whether user-level mwait is enabled.
6706  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6707  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6708  if (__kmp_mic_type == mic3) {
6709  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6710  if ((res & 0x1) || __kmp_user_level_mwait) {
6711  __kmp_mwait_enabled = TRUE;
6712  if (__kmp_user_level_mwait) {
6713  KMP_INFORM(EnvMwaitWarn);
6714  }
6715  } else {
6716  __kmp_mwait_enabled = FALSE;
6717  }
6718  }
6719  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6720  "__kmp_mwait_enabled = %d\n",
6721  __kmp_mic_type, __kmp_mwait_enabled));
6722 }
6723 #endif /* KMP_HAVE_UMWAIT */
6724 
6725 static void __kmp_do_serial_initialize(void) {
6726  int i, gtid;
6727  size_t size;
6728 
6729  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6730 
6731  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6732  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6733  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6734  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6735  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6736 
6737 #if OMPT_SUPPORT
6738  ompt_pre_init();
6739 #endif
6740 #if OMPD_SUPPORT
6741  __kmp_env_dump();
6742  ompd_init();
6743 #endif
6744 
6745  __kmp_validate_locks();
6746 
6747  /* Initialize internal memory allocator */
6748  __kmp_init_allocator();
6749 
6750  /* Register the library startup via an environment variable and check to see
6751  whether another copy of the library is already registered. */
6752 
6753  __kmp_register_library_startup();
6754 
6755  /* TODO reinitialization of library */
6756  if (TCR_4(__kmp_global.g.g_done)) {
6757  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6758  }
6759 
6760  __kmp_global.g.g_abort = 0;
6761  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6762 
6763 /* initialize the locks */
6764 #if KMP_USE_ADAPTIVE_LOCKS
6765 #if KMP_DEBUG_ADAPTIVE_LOCKS
6766  __kmp_init_speculative_stats();
6767 #endif
6768 #endif
6769 #if KMP_STATS_ENABLED
6770  __kmp_stats_init();
6771 #endif
6772  __kmp_init_lock(&__kmp_global_lock);
6773  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6774  __kmp_init_lock(&__kmp_debug_lock);
6775  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6776  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6777  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6778  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6779  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6780  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6781  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6782  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6783  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6784  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6785  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6786  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6787  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6788  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6789  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6790 #if KMP_USE_MONITOR
6791  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6792 #endif
6793  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6794 
6795  /* conduct initialization and initial setup of configuration */
6796 
6797  __kmp_runtime_initialize();
6798 
6799 #if KMP_MIC_SUPPORTED
6800  __kmp_check_mic_type();
6801 #endif
6802 
6803 // Some global variable initialization moved here from kmp_env_initialize()
6804 #ifdef KMP_DEBUG
6805  kmp_diag = 0;
6806 #endif
6807  __kmp_abort_delay = 0;
6808 
6809  // From __kmp_init_dflt_team_nth()
6810  /* assume the entire machine will be used */
6811  __kmp_dflt_team_nth_ub = __kmp_xproc;
6812  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6813  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6814  }
6815  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6816  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6817  }
6818  __kmp_max_nth = __kmp_sys_max_nth;
6819  __kmp_cg_max_nth = __kmp_sys_max_nth;
6820  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6821  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6822  __kmp_teams_max_nth = __kmp_sys_max_nth;
6823  }
6824 
6825  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6826  // part
6827  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6828 #if KMP_USE_MONITOR
6829  __kmp_monitor_wakeups =
6830  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6831  __kmp_bt_intervals =
6832  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6833 #endif
6834  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6835  __kmp_library = library_throughput;
6836  // From KMP_SCHEDULE initialization
6837  __kmp_static = kmp_sch_static_balanced;
6838 // AC: do not use analytical here, because it is non-monotonic
6839 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6840 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6841 // need to repeat assignment
6842 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6843 // bit control and barrier method control parts
6844 #if KMP_FAST_REDUCTION_BARRIER
6845 #define kmp_reduction_barrier_gather_bb ((int)1)
6846 #define kmp_reduction_barrier_release_bb ((int)1)
6847 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6848 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6849 #endif // KMP_FAST_REDUCTION_BARRIER
6850  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6851  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6852  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6853  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6854  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6855 #if KMP_FAST_REDUCTION_BARRIER
6856  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6857  // lin_64 ): hyper,1
6858  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6859  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6860  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6861  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6862  }
6863 #endif // KMP_FAST_REDUCTION_BARRIER
6864  }
6865 #if KMP_FAST_REDUCTION_BARRIER
6866 #undef kmp_reduction_barrier_release_pat
6867 #undef kmp_reduction_barrier_gather_pat
6868 #undef kmp_reduction_barrier_release_bb
6869 #undef kmp_reduction_barrier_gather_bb
6870 #endif // KMP_FAST_REDUCTION_BARRIER
6871 #if KMP_MIC_SUPPORTED
6872  if (__kmp_mic_type == mic2) { // KNC
6873  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6874  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6875  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6876  1; // forkjoin release
6877  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6878  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6879  }
6880 #if KMP_FAST_REDUCTION_BARRIER
6881  if (__kmp_mic_type == mic2) { // KNC
6882  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6883  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6884  }
6885 #endif // KMP_FAST_REDUCTION_BARRIER
6886 #endif // KMP_MIC_SUPPORTED
6887 
6888 // From KMP_CHECKS initialization
6889 #ifdef KMP_DEBUG
6890  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6891 #else
6892  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6893 #endif
6894 
6895  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6896  __kmp_foreign_tp = TRUE;
6897 
6898  __kmp_global.g.g_dynamic = FALSE;
6899  __kmp_global.g.g_dynamic_mode = dynamic_default;
6900 
6901  __kmp_init_nesting_mode();
6902 
6903  __kmp_env_initialize(NULL);
6904 
6905 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6906  __kmp_user_level_mwait_init();
6907 #endif
6908 // Print all messages in message catalog for testing purposes.
6909 #ifdef KMP_DEBUG
6910  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6911  if (__kmp_str_match_true(val)) {
6912  kmp_str_buf_t buffer;
6913  __kmp_str_buf_init(&buffer);
6914  __kmp_i18n_dump_catalog(&buffer);
6915  __kmp_printf("%s", buffer.str);
6916  __kmp_str_buf_free(&buffer);
6917  }
6918  __kmp_env_free(&val);
6919 #endif
6920 
6921  __kmp_threads_capacity =
6922  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6923  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6924  __kmp_tp_capacity = __kmp_default_tp_capacity(
6925  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6926 
6927  // If the library is shut down properly, both pools must be NULL. Just in
6928  // case, set them to NULL -- some memory may leak, but subsequent code will
6929  // work even if pools are not freed.
6930  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6931  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6932  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6933  __kmp_thread_pool = NULL;
6934  __kmp_thread_pool_insert_pt = NULL;
6935  __kmp_team_pool = NULL;
6936 
6937  /* Allocate all of the variable sized records */
6938  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6939  * expandable */
6940  /* Since allocation is cache-aligned, just add extra padding at the end */
6941  size =
6942  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6943  CACHE_LINE;
6944  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6945  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6946  sizeof(kmp_info_t *) * __kmp_threads_capacity);
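  // Both arrays live in the single cache-aligned allocation made above:
  // __kmp_threads occupies the first __kmp_threads_capacity pointers and
  // __kmp_root starts immediately after it.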
6947 
6948  /* init thread counts */
6949  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6950  0); // Asserts fail if the library is reinitializing and
6951  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6952  __kmp_all_nth = 0;
6953  __kmp_nth = 0;
6954 
6955  /* setup the uber master thread and hierarchy */
6956  gtid = __kmp_register_root(TRUE);
6957  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6958  KMP_ASSERT(KMP_UBER_GTID(gtid));
6959  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6960 
6961  KMP_MB(); /* Flush all pending memory write invalidates. */
6962 
6963  __kmp_common_initialize();
6964 
6965 #if KMP_OS_UNIX
6966  /* invoke the child fork handler */
6967  __kmp_register_atfork();
6968 #endif
6969 
6970 #if !KMP_DYNAMIC_LIB
6971  {
6972  /* Invoke the exit handler when the program finishes, only for static
6973  library. For dynamic library, we already have _fini and DllMain. */
6974  int rc = atexit(__kmp_internal_end_atexit);
6975  if (rc != 0) {
6976  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6977  __kmp_msg_null);
6978  }
6979  }
6980 #endif
6981 
6982 #if KMP_HANDLE_SIGNALS
6983 #if KMP_OS_UNIX
6984  /* NOTE: make sure that this is called before the user installs their own
6985  signal handlers so that the user handlers are called first. this way they
6986  can return false, not call our handler, avoid terminating the library, and
6987  continue execution where they left off. */
6988  __kmp_install_signals(FALSE);
6989 #endif /* KMP_OS_UNIX */
6990 #if KMP_OS_WINDOWS
6991  __kmp_install_signals(TRUE);
6992 #endif /* KMP_OS_WINDOWS */
6993 #endif
6994 
6995  /* we have finished the serial initialization */
6996  __kmp_init_counter++;
6997 
6998  __kmp_init_serial = TRUE;
6999 
7000  if (__kmp_settings) {
7001  __kmp_env_print();
7002  }
7003 
7004  if (__kmp_display_env || __kmp_display_env_verbose) {
7005  __kmp_env_print_2();
7006  }
7007 
7008 #if OMPT_SUPPORT
7009  ompt_post_init();
7010 #endif
7011 
7012  KMP_MB();
7013 
7014  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7015 }
7016 
7017 void __kmp_serial_initialize(void) {
7018  if (__kmp_init_serial) {
7019  return;
7020  }
7021  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7022  if (__kmp_init_serial) {
7023  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7024  return;
7025  }
7026  __kmp_do_serial_initialize();
7027  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7028 }
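// The wrapper above (like __kmp_middle_initialize and __kmp_parallel_initialize
// below) uses double-checked initialization: the init flag is tested both
// before and after acquiring __kmp_initz_lock, so the already-initialized fast
// path never takes the lock.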
7029 
7030 static void __kmp_do_middle_initialize(void) {
7031  int i, j;
7032  int prev_dflt_team_nth;
7033 
7034  if (!__kmp_init_serial) {
7035  __kmp_do_serial_initialize();
7036  }
7037 
7038  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7039 
7040  // Save the previous value for the __kmp_dflt_team_nth so that
7041  // we can avoid some reinitialization if it hasn't changed.
7042  prev_dflt_team_nth = __kmp_dflt_team_nth;
7043 
7044 #if KMP_AFFINITY_SUPPORTED
7045  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7046  // number of cores on the machine.
7047  __kmp_affinity_initialize();
7048 
7049 #endif /* KMP_AFFINITY_SUPPORTED */
7050 
7051  KMP_ASSERT(__kmp_xproc > 0);
7052  if (__kmp_avail_proc == 0) {
7053  __kmp_avail_proc = __kmp_xproc;
7054  }
7055 
7056  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7057  // correct them now
7058  j = 0;
7059  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7060  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7061  __kmp_avail_proc;
7062  j++;
7063  }
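  // For example, with OMP_NUM_THREADS=,,2,3 the two leading empty slots are
  // filled in here with __kmp_avail_proc.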
7064 
7065  if (__kmp_dflt_team_nth == 0) {
7066 #ifdef KMP_DFLT_NTH_CORES
7067  // Default #threads = #cores
7068  __kmp_dflt_team_nth = __kmp_ncores;
7069  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7070  "__kmp_ncores (%d)\n",
7071  __kmp_dflt_team_nth));
7072 #else
7073  // Default #threads = #available OS procs
7074  __kmp_dflt_team_nth = __kmp_avail_proc;
7075  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7076  "__kmp_avail_proc(%d)\n",
7077  __kmp_dflt_team_nth));
7078 #endif /* KMP_DFLT_NTH_CORES */
7079  }
7080 
7081  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7082  __kmp_dflt_team_nth = KMP_MIN_NTH;
7083  }
7084  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7085  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7086  }
7087 
7088  if (__kmp_nesting_mode > 0)
7089  __kmp_set_nesting_mode_threads();
7090 
7091  // There's no harm in continuing if the following check fails,
7092  // but it indicates an error in the previous logic.
7093  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7094 
7095  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7096  // Run through the __kmp_threads array and set the num threads icv for each
7097  // root thread that is currently registered with the RTL (which has not
7098  // already explicitly set its nthreads-var with a call to
7099  // omp_set_num_threads()).
7100  for (i = 0; i < __kmp_threads_capacity; i++) {
7101  kmp_info_t *thread = __kmp_threads[i];
7102  if (thread == NULL)
7103  continue;
7104  if (thread->th.th_current_task->td_icvs.nproc != 0)
7105  continue;
7106 
7107  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7108  }
7109  }
7110  KA_TRACE(
7111  20,
7112  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7113  __kmp_dflt_team_nth));
7114 
7115 #ifdef KMP_ADJUST_BLOCKTIME
7116  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7117  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7118  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7119  if (__kmp_nth > __kmp_avail_proc) {
7120  __kmp_zero_bt = TRUE;
7121  }
7122  }
7123 #endif /* KMP_ADJUST_BLOCKTIME */
7124 
7125  /* we have finished middle initialization */
7126  TCW_SYNC_4(__kmp_init_middle, TRUE);
7127 
7128  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7129 }
7130 
7131 void __kmp_middle_initialize(void) {
7132  if (__kmp_init_middle) {
7133  return;
7134  }
7135  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7136  if (__kmp_init_middle) {
7137  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7138  return;
7139  }
7140  __kmp_do_middle_initialize();
7141  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7142 }
7143 
7144 void __kmp_parallel_initialize(void) {
7145  int gtid = __kmp_entry_gtid(); // this might be a new root
7146 
7147  /* synchronize parallel initialization (for sibling) */
7148  if (TCR_4(__kmp_init_parallel))
7149  return;
7150  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7151  if (TCR_4(__kmp_init_parallel)) {
7152  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7153  return;
7154  }
7155 
7156  /* TODO reinitialization after we have already shut down */
7157  if (TCR_4(__kmp_global.g.g_done)) {
7158  KA_TRACE(
7159  10,
7160  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7161  __kmp_infinite_loop();
7162  }
7163 
7164  /* jc: The lock __kmp_initz_lock is already held, so calling
7165  __kmp_serial_initialize would cause a deadlock. So we call
7166  __kmp_do_serial_initialize directly. */
7167  if (!__kmp_init_middle) {
7168  __kmp_do_middle_initialize();
7169  }
7170  __kmp_assign_root_init_mask();
7171  __kmp_resume_if_hard_paused();
7172 
7173  /* begin initialization */
7174  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7175  KMP_ASSERT(KMP_UBER_GTID(gtid));
7176 
7177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7178  // Save the FP control regs.
7179  // Worker threads will set theirs to these values at thread startup.
7180  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7181  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7182  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7183 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7184 
7185 #if KMP_OS_UNIX
7186 #if KMP_HANDLE_SIGNALS
7187  /* must be after __kmp_serial_initialize */
7188  __kmp_install_signals(TRUE);
7189 #endif
7190 #endif
7191 
7192  __kmp_suspend_initialize();
7193 
7194 #if defined(USE_LOAD_BALANCE)
7195  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7196  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7197  }
7198 #else
7199  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7200  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7201  }
7202 #endif
7203 
7204  if (__kmp_version) {
7205  __kmp_print_version_2();
7206  }
7207 
7208  /* we have finished parallel initialization */
7209  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7210 
7211  KMP_MB();
7212  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7213 
7214  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7215 }
7216 
7217 void __kmp_hidden_helper_initialize() {
7218  if (TCR_4(__kmp_init_hidden_helper))
7219  return;
7220 
7221  // __kmp_parallel_initialize is required before we initialize hidden helper
7222  if (!TCR_4(__kmp_init_parallel))
7223  __kmp_parallel_initialize();
7224 
7225  // Double check. Note that this double check should not be placed before
7226  // __kmp_parallel_initialize, as it would cause a deadlock.
7227  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7228  if (TCR_4(__kmp_init_hidden_helper)) {
7229  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7230  return;
7231  }
7232 
7233  // Set the count of hidden helper tasks to be executed to zero
7234  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7235 
7236  // Set the global variable indicating that we're initializing hidden helper
7237  // team/threads
7238  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7239 
7240  // Platform independent initialization
7241  __kmp_do_initialize_hidden_helper_threads();
7242 
7243  // Wait here for the finish of initialization of hidden helper teams
7244  __kmp_hidden_helper_threads_initz_wait();
7245 
7246  // We have finished hidden helper initialization
7247  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7248 
7249  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7250 }
7251 
7252 /* ------------------------------------------------------------------------ */
7253 
7254 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7255  kmp_team_t *team) {
7256  kmp_disp_t *dispatch;
7257 
7258  KMP_MB();
7259 
7260  /* none of the threads have encountered any constructs, yet. */
7261  this_thr->th.th_local.this_construct = 0;
7262 #if KMP_CACHE_MANAGE
7263  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7264 #endif /* KMP_CACHE_MANAGE */
7265  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7266  KMP_DEBUG_ASSERT(dispatch);
7267  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7268  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7269  // this_thr->th.th_info.ds.ds_tid ] );
7270 
7271  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7272  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7273  if (__kmp_env_consistency_check)
7274  __kmp_push_parallel(gtid, team->t.t_ident);
7275 
7276  KMP_MB(); /* Flush all pending memory write invalidates. */
7277 }
7278 
7279 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7280  kmp_team_t *team) {
7281  if (__kmp_env_consistency_check)
7282  __kmp_pop_parallel(gtid, team->t.t_ident);
7283 
7284  __kmp_finish_implicit_task(this_thr);
7285 }
7286 
7287 int __kmp_invoke_task_func(int gtid) {
7288  int rc;
7289  int tid = __kmp_tid_from_gtid(gtid);
7290  kmp_info_t *this_thr = __kmp_threads[gtid];
7291  kmp_team_t *team = this_thr->th.th_team;
7292 
7293  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7294 #if USE_ITT_BUILD
7295  if (__itt_stack_caller_create_ptr) {
7296  // inform ittnotify about entering user's code
7297  if (team->t.t_stack_id != NULL) {
7298  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7299  } else {
7300  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7301  __kmp_itt_stack_callee_enter(
7302  (__itt_caller)team->t.t_parent->t.t_stack_id);
7303  }
7304  }
7305 #endif /* USE_ITT_BUILD */
7306 #if INCLUDE_SSC_MARKS
7307  SSC_MARK_INVOKING();
7308 #endif
7309 
7310 #if OMPT_SUPPORT
7311  void *dummy;
7312  void **exit_frame_p;
7313  ompt_data_t *my_task_data;
7314  ompt_data_t *my_parallel_data;
7315  int ompt_team_size;
7316 
7317  if (ompt_enabled.enabled) {
7318  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7319  .ompt_task_info.frame.exit_frame.ptr);
7320  } else {
7321  exit_frame_p = &dummy;
7322  }
7323 
7324  my_task_data =
7325  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7326  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7327  if (ompt_enabled.ompt_callback_implicit_task) {
7328  ompt_team_size = team->t.t_nproc;
7329  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7330  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7331  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7332  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7333  }
7334 #endif
7335 
7336 #if KMP_STATS_ENABLED
7337  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7338  if (previous_state == stats_state_e::TEAMS_REGION) {
7339  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7340  } else {
7341  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7342  }
7343  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7344 #endif
7345 
7346  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7347  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7348 #if OMPT_SUPPORT
7349  ,
7350  exit_frame_p
7351 #endif
7352  );
7353 #if OMPT_SUPPORT
7354  *exit_frame_p = NULL;
7355  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7356 #endif
7357 
7358 #if KMP_STATS_ENABLED
7359  if (previous_state == stats_state_e::TEAMS_REGION) {
7360  KMP_SET_THREAD_STATE(previous_state);
7361  }
7362  KMP_POP_PARTITIONED_TIMER();
7363 #endif
7364 
7365 #if USE_ITT_BUILD
7366  if (__itt_stack_caller_create_ptr) {
7367  // inform ittnotify about leaving user's code
7368  if (team->t.t_stack_id != NULL) {
7369  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7370  } else {
7371  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7372  __kmp_itt_stack_callee_leave(
7373  (__itt_caller)team->t.t_parent->t.t_stack_id);
7374  }
7375  }
7376 #endif /* USE_ITT_BUILD */
7377  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7378 
7379  return rc;
7380 }
7381 
7382 void __kmp_teams_master(int gtid) {
7383  // This routine is called by all primary threads in teams construct
7384  kmp_info_t *thr = __kmp_threads[gtid];
7385  kmp_team_t *team = thr->th.th_team;
7386  ident_t *loc = team->t.t_ident;
7387  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7388  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7389  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7390  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7391  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7392 
7393  // This thread is a new CG root. Set up the proper variables.
7394  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7395  tmp->cg_root = thr; // Make thr the CG root
7396  // Init to thread limit stored when league primary threads were forked
7397  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7398  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7399  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7400  " cg_nthreads to 1\n",
7401  thr, tmp));
7402  tmp->up = thr->th.th_cg_roots;
7403  thr->th.th_cg_roots = tmp;
7404 
7405 // Launch the league of teams now, but do not let workers execute
7406 // (they wait on the fork barrier until the next parallel region)
7407 #if INCLUDE_SSC_MARKS
7408  SSC_MARK_FORKING();
7409 #endif
7410  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7411  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7412  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7413 #if INCLUDE_SSC_MARKS
7414  SSC_MARK_JOINING();
7415 #endif
7416  // If the team size was reduced from the limit, set it to the new size
7417  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7418  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7419  // AC: the last parameter "1" eliminates the join barrier, which won't work
7420  // because worker threads are at the fork barrier waiting for more parallel regions
7421  __kmp_join_call(loc, gtid
7422 #if OMPT_SUPPORT
7423  ,
7424  fork_context_intel
7425 #endif
7426  ,
7427  1);
7428 }
7429 
7430 int __kmp_invoke_teams_master(int gtid) {
7431  kmp_info_t *this_thr = __kmp_threads[gtid];
7432  kmp_team_t *team = this_thr->th.th_team;
7433 #if KMP_DEBUG
7434  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7435  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7436  (void *)__kmp_teams_master);
7437 #endif
7438  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7439 #if OMPT_SUPPORT
7440  int tid = __kmp_tid_from_gtid(gtid);
7441  ompt_data_t *task_data =
7442  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7443  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7444  if (ompt_enabled.ompt_callback_implicit_task) {
7445  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7446  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7447  ompt_task_initial);
7448  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7449  }
7450 #endif
7451  __kmp_teams_master(gtid);
7452 #if OMPT_SUPPORT
7453  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7454 #endif
7455  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7456  return 1;
7457 }
7458 
7459 /* This sets the requested number of threads for the next parallel region
7460  encountered by this team. Since this should be enclosed in the forkjoin
7461  critical section, it should avoid race conditions with asymmetrical nested
7462  parallelism. */
7463 
7464 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7465  kmp_info_t *thr = __kmp_threads[gtid];
7466 
7467  if (num_threads > 0)
7468  thr->th.th_set_nproc = num_threads;
7469 }
7470 
7471 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7472  int num_threads) {
7473  KMP_DEBUG_ASSERT(thr);
7474  // Remember the number of threads for inner parallel regions
7475  if (!TCR_4(__kmp_init_middle))
7476  __kmp_middle_initialize(); // get internal globals calculated
7477  __kmp_assign_root_init_mask();
7478  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7479  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7480 
7481  if (num_threads == 0) {
7482  if (__kmp_teams_thread_limit > 0) {
7483  num_threads = __kmp_teams_thread_limit;
7484  } else {
7485  num_threads = __kmp_avail_proc / num_teams;
7486  }
7487  // Adjust num_threads without a warning, as it is not a user setting.
7488  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7489  // No thread_limit clause was specified - do not change the thread-limit-var ICV.
7490  if (num_threads > __kmp_dflt_team_nth) {
7491  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7492  }
7493  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7494  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7495  } // prevent the team size from exceeding thread-limit-var
7496  if (num_teams * num_threads > __kmp_teams_max_nth) {
7497  num_threads = __kmp_teams_max_nth / num_teams;
7498  }
7499  if (num_threads == 0) {
7500  num_threads = 1;
7501  }
7502  } else {
7503  // This thread will be the primary thread of the league's primary threads.
7504  // Store the new thread limit; the old limit is saved in the th_cg_roots list.
7505  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7506  // num_threads = min(num_threads, nthreads-var)
7507  if (num_threads > __kmp_dflt_team_nth) {
7508  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7509  }
7510  if (num_teams * num_threads > __kmp_teams_max_nth) {
7511  int new_threads = __kmp_teams_max_nth / num_teams;
7512  if (new_threads == 0) {
7513  new_threads = 1;
7514  }
7515  if (new_threads != num_threads) {
7516  if (!__kmp_reserve_warn) { // user asked for too many threads
7517  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7518  __kmp_msg(kmp_ms_warning,
7519  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7520  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7521  }
7522  }
7523  num_threads = new_threads;
7524  }
7525  }
7526  thr->th.th_teams_size.nth = num_threads;
7527 }
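// Worked example with illustrative values only: with no thread_limit clause
// (num_threads == 0), __kmp_teams_thread_limit unset, __kmp_avail_proc == 8,
// and num_teams == 2, num_threads starts at 8 / 2 == 4 and is then clamped by
// nthreads-var, thread-limit-var, and __kmp_teams_max_nth before being stored
// in th_teams_size.nth.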
7528 
7529 /* this sets the requested number of teams for the teams region and/or
7530  the number of threads for the next parallel region encountered */
7531 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7532  int num_threads) {
7533  kmp_info_t *thr = __kmp_threads[gtid];
7534  KMP_DEBUG_ASSERT(num_teams >= 0);
7535  KMP_DEBUG_ASSERT(num_threads >= 0);
7536 
7537  if (num_teams == 0) {
7538  if (__kmp_nteams > 0) {
7539  num_teams = __kmp_nteams;
7540  } else {
7541  num_teams = 1; // default number of teams is 1.
7542  }
7543  }
7544  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7545  if (!__kmp_reserve_warn) {
7546  __kmp_reserve_warn = 1;
7547  __kmp_msg(kmp_ms_warning,
7548  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7549  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7550  }
7551  num_teams = __kmp_teams_max_nth;
7552  }
7553  // Set number of teams (number of threads in the outer "parallel" of the
7554  // teams)
7555  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7556 
7557  __kmp_push_thread_limit(thr, num_teams, num_threads);
7558 }
7559 
7560 /* This sets the requested number of teams for the teams region and/or
7561  the number of threads for the next parallel region encountered */
7562 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7563  int num_teams_ub, int num_threads) {
7564  kmp_info_t *thr = __kmp_threads[gtid];
7565  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7566  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7567  KMP_DEBUG_ASSERT(num_threads >= 0);
7568 
7569  if (num_teams_lb > num_teams_ub) {
7570  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7571  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7572  }
7573 
7574  int num_teams = 1; // default number of teams is 1.
7575 
7576  if (num_teams_lb == 0 && num_teams_ub > 0)
7577  num_teams_lb = num_teams_ub;
7578 
7579  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7580  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7581  if (num_teams > __kmp_teams_max_nth) {
7582  if (!__kmp_reserve_warn) {
7583  __kmp_reserve_warn = 1;
7584  __kmp_msg(kmp_ms_warning,
7585  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7586  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7587  }
7588  num_teams = __kmp_teams_max_nth;
7589  }
7590  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7591  num_teams = num_teams_ub;
7592  } else { // num_teams_lb <= num_teams <= num_teams_ub
7593  if (num_threads == 0) {
7594  if (num_teams_ub > __kmp_teams_max_nth) {
7595  num_teams = num_teams_lb;
7596  } else {
7597  num_teams = num_teams_ub;
7598  }
7599  } else {
7600  num_teams = (num_threads > __kmp_teams_max_nth)
7601  ? num_teams
7602  : __kmp_teams_max_nth / num_threads;
7603  if (num_teams < num_teams_lb) {
7604  num_teams = num_teams_lb;
7605  } else if (num_teams > num_teams_ub) {
7606  num_teams = num_teams_ub;
7607  }
7608  }
7609  }
7610  // Set number of teams (number of threads in the outer "parallel" of the
7611  // teams)
7612  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7613 
7614  __kmp_push_thread_limit(thr, num_teams, num_threads);
7615 }
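// Illustrative example (hypothetical values): for num_teams(4, 16) with no
// thread_limit clause, the upper bound 16 is chosen as long as it does not
// exceed __kmp_teams_max_nth; with a thread_limit of t <= __kmp_teams_max_nth,
// the choice is __kmp_teams_max_nth / t clamped into [4, 16].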
7616 
7617 // Set the proc_bind var to use in the following parallel region.
7618 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7619  kmp_info_t *thr = __kmp_threads[gtid];
7620  thr->th.th_set_proc_bind = proc_bind;
7621 }
7622 
7623 /* Launch the worker threads into the microtask. */
7624 
7625 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7626  kmp_info_t *this_thr = __kmp_threads[gtid];
7627 
7628 #ifdef KMP_DEBUG
7629  int f;
7630 #endif /* KMP_DEBUG */
7631 
7632  KMP_DEBUG_ASSERT(team);
7633  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7634  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7635  KMP_MB(); /* Flush all pending memory write invalidates. */
7636 
7637  team->t.t_construct = 0; /* no single directives seen yet */
7638  team->t.t_ordered.dt.t_value =
7639  0; /* thread 0 enters the ordered section first */
7640 
7641  /* Reset the identifiers on the dispatch buffer */
7642  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7643  if (team->t.t_max_nproc > 1) {
7644  int i;
7645  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7646  team->t.t_disp_buffer[i].buffer_index = i;
7647  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7648  }
7649  } else {
7650  team->t.t_disp_buffer[0].buffer_index = 0;
7651  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7652  }
7653 
7654  KMP_MB(); /* Flush all pending memory write invalidates. */
7655  KMP_ASSERT(this_thr->th.th_team == team);
7656 
7657 #ifdef KMP_DEBUG
7658  for (f = 0; f < team->t.t_nproc; f++) {
7659  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7660  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7661  }
7662 #endif /* KMP_DEBUG */
7663 
7664  /* release the worker threads so they may begin working */
7665  __kmp_fork_barrier(gtid, 0);
7666 }
7667 
7668 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7669  kmp_info_t *this_thr = __kmp_threads[gtid];
7670 
7671  KMP_DEBUG_ASSERT(team);
7672  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7673  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7674  KMP_MB(); /* Flush all pending memory write invalidates. */
7675 
7676  /* Join barrier after fork */
7677 
7678 #ifdef KMP_DEBUG
7679  if (__kmp_threads[gtid] &&
7680  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7681  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7682  __kmp_threads[gtid]);
7683  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7684  "team->t.t_nproc=%d\n",
7685  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7686  team->t.t_nproc);
7687  __kmp_print_structure();
7688  }
7689  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7690  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7691 #endif /* KMP_DEBUG */
7692 
7693  __kmp_join_barrier(gtid); /* wait for everyone */
7694 #if OMPT_SUPPORT
7695  if (ompt_enabled.enabled &&
7696  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7697  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7698  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7699  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7700 #if OMPT_OPTIONAL
7701  void *codeptr = NULL;
7702  if (KMP_MASTER_TID(ds_tid) &&
7703  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7704  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7705  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7706 
7707  if (ompt_enabled.ompt_callback_sync_region_wait) {
7708  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7709  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7710  codeptr);
7711  }
7712  if (ompt_enabled.ompt_callback_sync_region) {
7713  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7714  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7715  codeptr);
7716  }
7717 #endif
7718  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7719  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7720  ompt_scope_end, NULL, task_data, 0, ds_tid,
7721  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7722  }
7723  }
7724 #endif
7725 
7726  KMP_MB(); /* Flush all pending memory write invalidates. */
7727  KMP_ASSERT(this_thr->th.th_team == team);
7728 }
7729 
7730 /* ------------------------------------------------------------------------ */
7731 
7732 #ifdef USE_LOAD_BALANCE
7733 
7734 // Return the number of worker threads actively spinning in the hot team
7735 // if we are at the outermost level of parallelism. Otherwise, return 0.
7736 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7737  int i;
7738  int retval;
7739  kmp_team_t *hot_team;
7740 
7741  if (root->r.r_active) {
7742  return 0;
7743  }
7744  hot_team = root->r.r_hot_team;
7745  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7746  return hot_team->t.t_nproc - 1; // Don't count primary thread
7747  }
7748 
7749  // Skip the primary thread - it is accounted for elsewhere.
7750  retval = 0;
7751  for (i = 1; i < hot_team->t.t_nproc; i++) {
7752  if (hot_team->t.t_threads[i]->th.th_active) {
7753  retval++;
7754  }
7755  }
7756  return retval;
7757 }
7758 
7759 // Perform an automatic adjustment to the number of
7760 // threads used by the next parallel region.
7761 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7762  int retval;
7763  int pool_active;
7764  int hot_team_active;
7765  int team_curr_active;
7766  int system_active;
7767 
7768  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7769  set_nproc));
7770  KMP_DEBUG_ASSERT(root);
7771  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7772  ->th.th_current_task->td_icvs.dynamic == TRUE);
7773  KMP_DEBUG_ASSERT(set_nproc > 1);
7774 
7775  if (set_nproc == 1) {
7776  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7777  return 1;
7778  }
7779 
7780  // Threads that are active in the thread pool, active in the hot team for this
7781  // particular root (if we are at the outer par level), and the currently
7782  // executing thread (to become the primary thread) are available to add to the
7783  // new team, but are currently contributing to the system load, and must be
7784  // accounted for.
7785  pool_active = __kmp_thread_pool_active_nth;
7786  hot_team_active = __kmp_active_hot_team_nproc(root);
7787  team_curr_active = pool_active + hot_team_active + 1;
7788 
7789  // Check the system load.
7790  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7791  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7792  "hot team active = %d\n",
7793  system_active, pool_active, hot_team_active));
7794 
7795  if (system_active < 0) {
7796  // There was an error reading the necessary info from /proc, so use the
7797  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7798  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7799  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7800  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7801 
7802  // Make this call behave like the thread limit algorithm.
7803  retval = __kmp_avail_proc - __kmp_nth +
7804  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7805  if (retval > set_nproc) {
7806  retval = set_nproc;
7807  }
7808  if (retval < KMP_MIN_NTH) {
7809  retval = KMP_MIN_NTH;
7810  }
7811 
7812  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7813  retval));
7814  return retval;
7815  }
7816 
7817  // There is a slight delay in the load balance algorithm in detecting new
7818  // running procs. The real system load at this instant should be at least as
7819  // large as the #active omp threads that are available to add to the team.
7820  if (system_active < team_curr_active) {
7821  system_active = team_curr_active;
7822  }
7823  retval = __kmp_avail_proc - system_active + team_curr_active;
7824  if (retval > set_nproc) {
7825  retval = set_nproc;
7826  }
7827  if (retval < KMP_MIN_NTH) {
7828  retval = KMP_MIN_NTH;
7829  }
7830 
7831  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7832  return retval;
7833 } // __kmp_load_balance_nproc()
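// A worked example of the formula above (hypothetical numbers): with
// __kmp_avail_proc=16, pool_active=2, hot_team_active=1 (so
// team_curr_active=4) and __kmp_get_load_balance() reporting 10 active
// threads system-wide, retval = 16 - 10 + 4 = 10; a request of set_nproc=8
// is then clamped down to 8, while a heavily loaded system drives retval
// down toward KMP_MIN_NTH.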
7834 
7835 #endif /* USE_LOAD_BALANCE */
7836 
7837 /* ------------------------------------------------------------------------ */
7838 
7839 /* NOTE: this is called with the __kmp_init_lock held */
7840 void __kmp_cleanup(void) {
7841  int f;
7842 
7843  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7844 
7845  if (TCR_4(__kmp_init_parallel)) {
7846 #if KMP_HANDLE_SIGNALS
7847  __kmp_remove_signals();
7848 #endif
7849  TCW_4(__kmp_init_parallel, FALSE);
7850  }
7851 
7852  if (TCR_4(__kmp_init_middle)) {
7853 #if KMP_AFFINITY_SUPPORTED
7854  __kmp_affinity_uninitialize();
7855 #endif /* KMP_AFFINITY_SUPPORTED */
7856  __kmp_cleanup_hierarchy();
7857  TCW_4(__kmp_init_middle, FALSE);
7858  }
7859 
7860  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7861 
7862  if (__kmp_init_serial) {
7863  __kmp_runtime_destroy();
7864  __kmp_init_serial = FALSE;
7865  }
7866 
7867  __kmp_cleanup_threadprivate_caches();
7868 
7869  for (f = 0; f < __kmp_threads_capacity; f++) {
7870  if (__kmp_root[f] != NULL) {
7871  __kmp_free(__kmp_root[f]);
7872  __kmp_root[f] = NULL;
7873  }
7874  }
7875  __kmp_free(__kmp_threads);
7876  // __kmp_threads and __kmp_root were allocated at once, as single block, so
7877  // there is no need in freeing __kmp_root.
7878  __kmp_threads = NULL;
7879  __kmp_root = NULL;
7880  __kmp_threads_capacity = 0;
7881 
7882 #if KMP_USE_DYNAMIC_LOCK
7883  __kmp_cleanup_indirect_user_locks();
7884 #else
7885  __kmp_cleanup_user_locks();
7886 #endif
7887 #if OMPD_SUPPORT
7888  if (ompd_state) {
7889  __kmp_free(ompd_env_block);
7890  ompd_env_block = NULL;
7891  ompd_env_block_size = 0;
7892  }
7893 #endif
7894 
7895 #if KMP_AFFINITY_SUPPORTED
7896  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7897  __kmp_cpuinfo_file = NULL;
7898 #endif /* KMP_AFFINITY_SUPPORTED */
7899 
7900 #if KMP_USE_ADAPTIVE_LOCKS
7901 #if KMP_DEBUG_ADAPTIVE_LOCKS
7902  __kmp_print_speculative_stats();
7903 #endif
7904 #endif
7905  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7906  __kmp_nested_nth.nth = NULL;
7907  __kmp_nested_nth.size = 0;
7908  __kmp_nested_nth.used = 0;
7909  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7910  __kmp_nested_proc_bind.bind_types = NULL;
7911  __kmp_nested_proc_bind.size = 0;
7912  __kmp_nested_proc_bind.used = 0;
7913  if (__kmp_affinity_format) {
7914  KMP_INTERNAL_FREE(__kmp_affinity_format);
7915  __kmp_affinity_format = NULL;
7916  }
7917 
7918  __kmp_i18n_catclose();
7919 
7920 #if KMP_USE_HIER_SCHED
7921  __kmp_hier_scheds.deallocate();
7922 #endif
7923 
7924 #if KMP_STATS_ENABLED
7925  __kmp_stats_fini();
7926 #endif
7927 
7928  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7929 }
7930 
7931 /* ------------------------------------------------------------------------ */
7932 
7933 int __kmp_ignore_mppbeg(void) {
7934  char *env;
7935 
7936  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7937  if (__kmp_str_match_false(env))
7938  return FALSE;
7939  }
7940  // By default __kmpc_begin() is no-op.
7941  return TRUE;
7942 }
7943 
7944 int __kmp_ignore_mppend(void) {
7945  char *env;
7946 
7947  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7948  if (__kmp_str_match_false(env))
7949  return FALSE;
7950  }
7951  // By default __kmpc_end() is no-op.
7952  return TRUE;
7953 }
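// Usage note for the two checks above: the defaults make __kmpc_begin() and
// __kmpc_end() no-ops; setting the corresponding environment variable to a
// false value (e.g., KMP_IGNORE_MPPEND=false) makes the matching query return
// FALSE so the call is honored instead of ignored.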
7954 
7955 void __kmp_internal_begin(void) {
7956  int gtid;
7957  kmp_root_t *root;
7958 
7959  /* this is a very important step as it will register new sibling threads
7960  and assign these new uber threads a new gtid */
7961  gtid = __kmp_entry_gtid();
7962  root = __kmp_threads[gtid]->th.th_root;
7963  KMP_ASSERT(KMP_UBER_GTID(gtid));
7964 
7965  if (root->r.r_begin)
7966  return;
7967  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7968  if (root->r.r_begin) {
7969  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7970  return;
7971  }
7972 
7973  root->r.r_begin = TRUE;
7974 
7975  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7976 }
7977 
7978 /* ------------------------------------------------------------------------ */
7979 
7980 void __kmp_user_set_library(enum library_type arg) {
7981  int gtid;
7982  kmp_root_t *root;
7983  kmp_info_t *thread;
7984 
7985  /* first, make sure we are initialized so we can get our gtid */
7986 
7987  gtid = __kmp_entry_gtid();
7988  thread = __kmp_threads[gtid];
7989 
7990  root = thread->th.th_root;
7991 
7992  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7993  library_serial));
7994  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7995  thread */
7996  KMP_WARNING(SetLibraryIncorrectCall);
7997  return;
7998  }
7999 
8000  switch (arg) {
8001  case library_serial:
8002  thread->th.th_set_nproc = 0;
8003  set__nproc(thread, 1);
8004  break;
8005  case library_turnaround:
8006  thread->th.th_set_nproc = 0;
8007  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8008  : __kmp_dflt_team_nth_ub);
8009  break;
8010  case library_throughput:
8011  thread->th.th_set_nproc = 0;
8012  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8013  : __kmp_dflt_team_nth_ub);
8014  break;
8015  default:
8016  KMP_FATAL(UnknownLibraryType, arg);
8017  }
8018 
8019  __kmp_aux_set_library(arg);
8020 }
8021 
8022 void __kmp_aux_set_stacksize(size_t arg) {
8023  if (!__kmp_init_serial)
8024  __kmp_serial_initialize();
8025 
8026 #if KMP_OS_DARWIN
8027  if (arg & (0x1000 - 1)) {
8028  arg &= ~(0x1000 - 1);
8029  if (arg + 0x1000) /* check for overflow if we round up */
8030  arg += 0x1000;
8031  }
8032 #endif
8033  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8034 
8035  /* only change the default stacksize before the first parallel region */
8036  if (!TCR_4(__kmp_init_parallel)) {
8037  size_t value = arg; /* argument is in bytes */
8038 
8039  if (value < __kmp_sys_min_stksize)
8040  value = __kmp_sys_min_stksize;
8041  else if (value > KMP_MAX_STKSIZE)
8042  value = KMP_MAX_STKSIZE;
8043 
8044  __kmp_stksize = value;
8045 
8046  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8047  }
8048 
8049  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8050 }
8051 
8052 /* set the behaviour of the runtime library */
8053 /* TODO this can cause some odd behaviour with sibling parallelism... */
8054 void __kmp_aux_set_library(enum library_type arg) {
8055  __kmp_library = arg;
8056 
8057  switch (__kmp_library) {
8058  case library_serial: {
8059  KMP_INFORM(LibraryIsSerial);
8060  } break;
8061  case library_turnaround:
8062  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8063  __kmp_use_yield = 2; // only yield when oversubscribed
8064  break;
8065  case library_throughput:
8066  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8067  __kmp_dflt_blocktime = 200;
8068  break;
8069  default:
8070  KMP_FATAL(UnknownLibraryType, arg);
8071  }
8072 }
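// Usage note: this routine presumably backs the kmp_set_library_*() extension
// entry points and the KMP_LIBRARY environment setting; per the switch above,
// "turnaround" only yields when the machine is oversubscribed, while
// "throughput" caps an otherwise-infinite blocktime at 200 ms so idle workers
// eventually go to sleep.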
8073 
8074 /* Getting team information common for all team API */
8075 // Returns NULL if not in teams construct
8076 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8077  kmp_info_t *thr = __kmp_entry_thread();
8078  teams_serialized = 0;
8079  if (thr->th.th_teams_microtask) {
8080  kmp_team_t *team = thr->th.th_team;
8081  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8082  int ii = team->t.t_level;
8083  teams_serialized = team->t.t_serialized;
8084  int level = tlevel + 1;
8085  KMP_DEBUG_ASSERT(ii >= tlevel);
8086  while (ii > level) {
8087  for (teams_serialized = team->t.t_serialized;
8088  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8089  }
8090  if (team->t.t_serialized && (!teams_serialized)) {
8091  team = team->t.t_parent;
8092  continue;
8093  }
8094  if (ii > level) {
8095  team = team->t.t_parent;
8096  ii--;
8097  }
8098  }
8099  return team;
8100  }
8101  return NULL;
8102 }
8103 
8104 int __kmp_aux_get_team_num() {
8105  int serialized;
8106  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8107  if (team) {
8108  if (serialized > 1) {
8109  return 0; // teams region is serialized ( 1 team of 1 thread ).
8110  } else {
8111  return team->t.t_master_tid;
8112  }
8113  }
8114  return 0;
8115 }
8116 
8117 int __kmp_aux_get_num_teams() {
8118  int serialized;
8119  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8120  if (team) {
8121  if (serialized > 1) {
8122  return 1;
8123  } else {
8124  return team->t.t_parent->t.t_nproc;
8125  }
8126  }
8127  return 1;
8128 }
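// The two helpers above provide the values behind the user-level queries
// omp_get_team_num() and omp_get_num_teams(); outside of a teams construct
// (or when the teams region is serialized) they report team number 0 and a
// single team, matching the defaults required by the specification.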
8129 
8130 /* ------------------------------------------------------------------------ */
8131 
8132 /*
8133  * Affinity Format Parser
8134  *
8135  * Field is in form of: %[[[0].]size]type
8136  * % and type are required (%% means print a literal '%')
8137  * type is either single char or long name surrounded by {},
8138  * e.g., N or {num_threads}
8139  * 0 => leading zeros
8140  * . => right justified when size is specified
8141  * by default output is left justified
8142  * size is the *minimum* field length
8143  * All other characters are printed as is
8144  *
8145  * Available field types:
8146  * t {team_num} - omp_get_team_num()
8147  * T {num_teams} - omp_get_num_teams()
8148  * L {nesting_level} - omp_get_level()
8149  * n {thread_num} - omp_get_thread_num()
8150  * N {num_threads} - omp_get_num_threads()
8151  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8152  * H {host} - name of host machine
8153  * P {process_id} - process id, i {native_thread_id} - native thread id
8154  * A {thread_affinity} - comma separated integers or ranges (affinity mask)
8155  *
8156  * Implementation-specific field types can be added
8157  * If a type is unknown, print "undefined"
8158  */
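// For example, a format string such as
//   "host=%H pid=%P tid=%i thread=%0.4n affinity={%A}"
// prints the host name, process id, native thread id, the OpenMP thread
// number right-justified and zero-padded to at least 4 characters, and the
// affinity mask; "%{thread_num}" is the equivalent long-name form of "%n".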
8159 
8160 // Structure holding the short name, long name, and corresponding data type
8161 // for snprintf. A table of these represents the full set of valid keyword
8162 // field types.
8163 typedef struct kmp_affinity_format_field_t {
8164  char short_name; // from spec e.g., L -> thread level
8165  const char *long_name; // from spec thread_level -> thread level
8166  char field_format; // data type for snprintf (typically 'd' or 's'
8167  // for integer or string)
8168 } kmp_affinity_format_field_t;
8169 
8170 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8171 #if KMP_AFFINITY_SUPPORTED
8172  {'A', "thread_affinity", 's'},
8173 #endif
8174  {'t', "team_num", 'd'},
8175  {'T', "num_teams", 'd'},
8176  {'L', "nesting_level", 'd'},
8177  {'n', "thread_num", 'd'},
8178  {'N', "num_threads", 'd'},
8179  {'a', "ancestor_tnum", 'd'},
8180  {'H', "host", 's'},
8181  {'P', "process_id", 'd'},
8182  {'i', "native_thread_id", 'd'}};
8183 
8184 // Return the number of characters it takes to hold field
8185 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8186  const char **ptr,
8187  kmp_str_buf_t *field_buffer) {
8188  int rc, format_index, field_value;
8189  const char *width_left, *width_right;
8190  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8191  static const int FORMAT_SIZE = 20;
8192  char format[FORMAT_SIZE] = {0};
8193  char absolute_short_name = 0;
8194 
8195  KMP_DEBUG_ASSERT(gtid >= 0);
8196  KMP_DEBUG_ASSERT(th);
8197  KMP_DEBUG_ASSERT(**ptr == '%');
8198  KMP_DEBUG_ASSERT(field_buffer);
8199 
8200  __kmp_str_buf_clear(field_buffer);
8201 
8202  // Skip the initial %
8203  (*ptr)++;
8204 
8205  // Check for %% first
8206  if (**ptr == '%') {
8207  __kmp_str_buf_cat(field_buffer, "%", 1);
8208  (*ptr)++; // skip over the second %
8209  return 1;
8210  }
8211 
8212  // Parse field modifiers if they are present
8213  pad_zeros = false;
8214  if (**ptr == '0') {
8215  pad_zeros = true;
8216  (*ptr)++; // skip over 0
8217  }
8218  right_justify = false;
8219  if (**ptr == '.') {
8220  right_justify = true;
8221  (*ptr)++; // skip over .
8222  }
8223  // Parse width of field: [width_left, width_right)
8224  width_left = width_right = NULL;
8225  if (**ptr >= '0' && **ptr <= '9') {
8226  width_left = *ptr;
8227  SKIP_DIGITS(*ptr);
8228  width_right = *ptr;
8229  }
8230 
8231  // Create the format for KMP_SNPRINTF based on flags parsed above
8232  format_index = 0;
8233  format[format_index++] = '%';
8234  if (!right_justify)
8235  format[format_index++] = '-';
8236  if (pad_zeros)
8237  format[format_index++] = '0';
8238  if (width_left && width_right) {
8239  int i = 0;
8240  // Only allow 8 digit number widths.
8241  // This also prevents overflowing format variable
8242  while (i < 8 && width_left < width_right) {
8243  format[format_index++] = *width_left;
8244  width_left++;
8245  i++;
8246  }
8247  }
8248 
8249  // Parse a name (long or short)
8250  // Canonicalize the name into absolute_short_name
8251  found_valid_name = false;
8252  parse_long_name = (**ptr == '{');
8253  if (parse_long_name)
8254  (*ptr)++; // skip initial left brace
8255  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8256  sizeof(__kmp_affinity_format_table[0]);
8257  ++i) {
8258  char short_name = __kmp_affinity_format_table[i].short_name;
8259  const char *long_name = __kmp_affinity_format_table[i].long_name;
8260  char field_format = __kmp_affinity_format_table[i].field_format;
8261  if (parse_long_name) {
8262  size_t length = KMP_STRLEN(long_name);
8263  if (strncmp(*ptr, long_name, length) == 0) {
8264  found_valid_name = true;
8265  (*ptr) += length; // skip the long name
8266  }
8267  } else if (**ptr == short_name) {
8268  found_valid_name = true;
8269  (*ptr)++; // skip the short name
8270  }
8271  if (found_valid_name) {
8272  format[format_index++] = field_format;
8273  format[format_index++] = '\0';
8274  absolute_short_name = short_name;
8275  break;
8276  }
8277  }
8278  if (parse_long_name) {
8279  if (**ptr != '}') {
8280  absolute_short_name = 0;
8281  } else {
8282  (*ptr)++; // skip over the right brace
8283  }
8284  }
8285 
8286  // Attempt to fill the buffer with the requested
8287  // value using snprintf within __kmp_str_buf_print()
8288  switch (absolute_short_name) {
8289  case 't':
8290  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8291  break;
8292  case 'T':
8293  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8294  break;
8295  case 'L':
8296  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8297  break;
8298  case 'n':
8299  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8300  break;
8301  case 'H': {
8302  static const int BUFFER_SIZE = 256;
8303  char buf[BUFFER_SIZE];
8304  __kmp_expand_host_name(buf, BUFFER_SIZE);
8305  rc = __kmp_str_buf_print(field_buffer, format, buf);
8306  } break;
8307  case 'P':
8308  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8309  break;
8310  case 'i':
8311  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8312  break;
8313  case 'N':
8314  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8315  break;
8316  case 'a':
8317  field_value =
8318  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8319  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8320  break;
8321 #if KMP_AFFINITY_SUPPORTED
8322  case 'A': {
8323  kmp_str_buf_t buf;
8324  __kmp_str_buf_init(&buf);
8325  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8326  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8327  __kmp_str_buf_free(&buf);
8328  } break;
8329 #endif
8330  default:
8331  // According to the spec, if an implementation does not have info for a
8332  // field type, then "undefined" is printed
8333  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8334  // Skip the field
8335  if (parse_long_name) {
8336  SKIP_TOKEN(*ptr);
8337  if (**ptr == '}')
8338  (*ptr)++;
8339  } else {
8340  (*ptr)++;
8341  }
8342  }
8343 
8344  KMP_ASSERT(format_index <= FORMAT_SIZE);
8345  return rc;
8346 }
8347 
8348 /*
8349  * Return number of characters needed to hold the affinity string
8350  * (not including null byte character)
8351  * The resultant string is printed to buffer, which the caller can then
8352  * handle afterwards
8353  */
8354 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8355  kmp_str_buf_t *buffer) {
8356  const char *parse_ptr;
8357  size_t retval;
8358  const kmp_info_t *th;
8359  kmp_str_buf_t field;
8360 
8361  KMP_DEBUG_ASSERT(buffer);
8362  KMP_DEBUG_ASSERT(gtid >= 0);
8363 
8364  __kmp_str_buf_init(&field);
8365  __kmp_str_buf_clear(buffer);
8366 
8367  th = __kmp_threads[gtid];
8368  retval = 0;
8369 
8370  // If format is NULL or zero-length string, then we use
8371  // affinity-format-var ICV
8372  parse_ptr = format;
8373  if (parse_ptr == NULL || *parse_ptr == '\0') {
8374  parse_ptr = __kmp_affinity_format;
8375  }
8376  KMP_DEBUG_ASSERT(parse_ptr);
8377 
8378  while (*parse_ptr != '\0') {
8379  // Parse a field
8380  if (*parse_ptr == '%') {
8381  // Put field in the buffer
8382  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8383  __kmp_str_buf_catbuf(buffer, &field);
8384  retval += rc;
8385  } else {
8386  // Put literal character in buffer
8387  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8388  retval++;
8389  parse_ptr++;
8390  }
8391  }
8392  __kmp_str_buf_free(&field);
8393  return retval;
8394 }
8395 
8396 // Displays the affinity string to stdout
8397 void __kmp_aux_display_affinity(int gtid, const char *format) {
8398  kmp_str_buf_t buf;
8399  __kmp_str_buf_init(&buf);
8400  __kmp_aux_capture_affinity(gtid, format, &buf);
8401  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8402  __kmp_str_buf_free(&buf);
8403 }
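// A minimal user-level sketch of the functionality implemented above, assuming
// the standard OpenMP 5.0 entry points omp_display_affinity() and
// omp_capture_affinity() from <omp.h>, which end up in the two routines above:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main() {
//   #pragma omp parallel
//     {
//       char buf[256];
//       // NULL/empty format => use the affinity-format-var ICV
//       // (OMP_AFFINITY_FORMAT)
//       omp_display_affinity(NULL);
//       size_t n = omp_capture_affinity(buf, sizeof(buf), "thread %n of %N");
//       if (n < sizeof(buf)) // return value is the length needed, minus '\0'
//         printf("%s\n", buf);
//     }
//     return 0;
//   }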
8404 
8405 /* ------------------------------------------------------------------------ */
8406 
8407 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8408  int blocktime = arg; /* argument is in milliseconds */
8409 #if KMP_USE_MONITOR
8410  int bt_intervals;
8411 #endif
8412  kmp_int8 bt_set;
8413 
8414  __kmp_save_internal_controls(thread);
8415 
8416  /* Normalize and set blocktime for the teams */
8417  if (blocktime < KMP_MIN_BLOCKTIME)
8418  blocktime = KMP_MIN_BLOCKTIME;
8419  else if (blocktime > KMP_MAX_BLOCKTIME)
8420  blocktime = KMP_MAX_BLOCKTIME;
8421 
8422  set__blocktime_team(thread->th.th_team, tid, blocktime);
8423  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8424 
8425 #if KMP_USE_MONITOR
8426  /* Calculate and set blocktime intervals for the teams */
8427  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8428 
8429  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8430  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8431 #endif
8432 
8433  /* Set whether blocktime has been set to "TRUE" */
8434  bt_set = TRUE;
8435 
8436  set__bt_set_team(thread->th.th_team, tid, bt_set);
8437  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8438 #if KMP_USE_MONITOR
8439  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8440  "bt_intervals=%d, monitor_updates=%d\n",
8441  __kmp_gtid_from_tid(tid, thread->th.th_team),
8442  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8443  __kmp_monitor_wakeups));
8444 #else
8445  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8446  __kmp_gtid_from_tid(tid, thread->th.th_team),
8447  thread->th.th_team->t.t_id, tid, blocktime));
8448 #endif
8449 }
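// Usage note: this is the worker behind the kmp_set_blocktime() extension API
// and the KMP_BLOCKTIME setting; e.g., kmp_set_blocktime(0) asks worker
// threads to go to sleep immediately after finishing parallel work instead of
// spin-waiting for up to blocktime milliseconds.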
8450 
8451 void __kmp_aux_set_defaults(char const *str, size_t len) {
8452  if (!__kmp_init_serial) {
8453  __kmp_serial_initialize();
8454  }
8455  __kmp_env_initialize(str);
8456 
8457  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8458  __kmp_env_print();
8459  }
8460 } // __kmp_aux_set_defaults
8461 
8462 /* ------------------------------------------------------------------------ */
8463 /* internal fast reduction routines */
8464 
8465 PACKED_REDUCTION_METHOD_T
8466 __kmp_determine_reduction_method(
8467  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8468  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8469  kmp_critical_name *lck) {
8470 
8471  // Default reduction method: critical construct ( lck != NULL, like in current
8472  // PAROPT )
8473  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8474  // can be selected by RTL
8475  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8476  // can be selected by RTL
8477  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8478  // among generated by PAROPT.
8479 
8480  PACKED_REDUCTION_METHOD_T retval;
8481 
8482  int team_size;
8483 
8484  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8485  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8486 
8487 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8488  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8489 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8490 
8491  retval = critical_reduce_block;
8492 
8493  // another choice of getting a team size (with 1 dynamic dereference) is slower
8494  team_size = __kmp_get_team_num_threads(global_tid);
8495  if (team_size == 1) {
8496 
8497  retval = empty_reduce_block;
8498 
8499  } else {
8500 
8501  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8502 
8503 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8504  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8505 
8506 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8507  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8508 
8509  int teamsize_cutoff = 4;
8510 
8511 #if KMP_MIC_SUPPORTED
8512  if (__kmp_mic_type != non_mic) {
8513  teamsize_cutoff = 8;
8514  }
8515 #endif
8516  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8517  if (tree_available) {
8518  if (team_size <= teamsize_cutoff) {
8519  if (atomic_available) {
8520  retval = atomic_reduce_block;
8521  }
8522  } else {
8523  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8524  }
8525  } else if (atomic_available) {
8526  retval = atomic_reduce_block;
8527  }
8528 #else
8529 #error "Unknown or unsupported OS"
8530 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8531  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8532 
8533 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8534 
8535 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8536 
8537  // basic tuning
8538 
8539  if (atomic_available) {
8540  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8541  retval = atomic_reduce_block;
8542  }
8543  } // otherwise: use critical section
8544 
8545 #elif KMP_OS_DARWIN
8546 
8547  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8548  if (atomic_available && (num_vars <= 3)) {
8549  retval = atomic_reduce_block;
8550  } else if (tree_available) {
8551  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8552  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8553  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8554  }
8555  } // otherwise: use critical section
8556 
8557 #else
8558 #error "Unknown or unsupported OS"
8559 #endif
8560 
8561 #else
8562 #error "Unknown or unsupported architecture"
8563 #endif
8564  }
8565 
8566  // KMP_FORCE_REDUCTION
8567 
8568  // If the team is serialized (team_size == 1), ignore the forced reduction
8569  // method and stay with the unsynchronized method (empty_reduce_block)
8570  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8571  team_size != 1) {
8572 
8573  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8574 
8575  int atomic_available, tree_available;
8576 
8577  switch ((forced_retval = __kmp_force_reduction_method)) {
8578  case critical_reduce_block:
8579  KMP_ASSERT(lck); // lck should be != 0
8580  break;
8581 
8582  case atomic_reduce_block:
8583  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8584  if (!atomic_available) {
8585  KMP_WARNING(RedMethodNotSupported, "atomic");
8586  forced_retval = critical_reduce_block;
8587  }
8588  break;
8589 
8590  case tree_reduce_block:
8591  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8592  if (!tree_available) {
8593  KMP_WARNING(RedMethodNotSupported, "tree");
8594  forced_retval = critical_reduce_block;
8595  } else {
8596 #if KMP_FAST_REDUCTION_BARRIER
8597  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8598 #endif
8599  }
8600  break;
8601 
8602  default:
8603  KMP_ASSERT(0); // "unsupported method specified"
8604  }
8605 
8606  retval = forced_retval;
8607  }
8608 
8609  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8610 
8611 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8612 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8613 
8614  return (retval);
8615 }
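// Illustrative outcomes of the selection above on x86_64 Linux with no forced
// method: a serialized team always gets empty_reduce_block; a team of 4
// threads with an atomic-capable reduction picks atomic_reduce_block
// (team_size <= teamsize_cutoff); a team of 16 threads with tree-capable
// reduce_data/reduce_func picks TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER.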
8616 // this function is for testing set/get/determine reduce method
8617 kmp_int32 __kmp_get_reduce_method(void) {
8618  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8619 }
8620 
8621 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8622 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8623 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8624 
8625 // Hard pause shuts down the runtime completely. Resume happens naturally when
8626 // OpenMP is used subsequently.
8627 void __kmp_hard_pause() {
8628  __kmp_pause_status = kmp_hard_paused;
8629  __kmp_internal_end_thread(-1);
8630 }
8631 
8632 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8633 void __kmp_resume_if_soft_paused() {
8634  if (__kmp_pause_status == kmp_soft_paused) {
8635  __kmp_pause_status = kmp_not_paused;
8636 
8637  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8638  kmp_info_t *thread = __kmp_threads[gtid];
8639  if (thread) { // Wake it if sleeping
8640  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8641  thread);
8642  if (fl.is_sleeping())
8643  fl.resume(gtid);
8644  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8645  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8646  } else { // thread holds the lock and may sleep soon
8647  do { // until either the thread sleeps, or we can get the lock
8648  if (fl.is_sleeping()) {
8649  fl.resume(gtid);
8650  break;
8651  } else if (__kmp_try_suspend_mx(thread)) {
8652  __kmp_unlock_suspend_mx(thread);
8653  break;
8654  }
8655  } while (1);
8656  }
8657  }
8658  }
8659  }
8660 }
8661 
8662 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8663 // TODO: add warning messages
8664 int __kmp_pause_resource(kmp_pause_status_t level) {
8665  if (level == kmp_not_paused) { // requesting resume
8666  if (__kmp_pause_status == kmp_not_paused) {
8667  // error message about runtime not being paused, so can't resume
8668  return 1;
8669  } else {
8670  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8671  __kmp_pause_status == kmp_hard_paused);
8672  __kmp_pause_status = kmp_not_paused;
8673  return 0;
8674  }
8675  } else if (level == kmp_soft_paused) { // requesting soft pause
8676  if (__kmp_pause_status != kmp_not_paused) {
8677  // error message about already being paused
8678  return 1;
8679  } else {
8680  __kmp_soft_pause();
8681  return 0;
8682  }
8683  } else if (level == kmp_hard_paused) { // requesting hard pause
8684  if (__kmp_pause_status != kmp_not_paused) {
8685  // error message about already being paused
8686  return 1;
8687  } else {
8688  __kmp_hard_pause();
8689  return 0;
8690  }
8691  } else {
8692  // error message about invalid level
8693  return 1;
8694  }
8695 }
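// A minimal usage sketch, assuming the standard OpenMP 5.0 API
// omp_pause_resource_all() from <omp.h>, which ultimately reaches
// __kmp_pause_resource() above:
//
//   #include <omp.h>
//   // ... after some parallel work, release CPU resources for a while:
//   if (omp_pause_resource_all(omp_pause_soft) != 0) {
//     /* pause request rejected, e.g., the runtime is already paused */
//   }
//   // Soft-paused threads ignore blocktime and sleep; the next OpenMP
//   // construct resumes them. omp_pause_hard instead shuts the runtime down
//   // completely, and it is re-initialized on subsequent OpenMP use.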
8696 
8697 void __kmp_omp_display_env(int verbose) {
8698  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8699  if (__kmp_init_serial == 0)
8700  __kmp_do_serial_initialize();
8701  __kmp_display_env_impl(!verbose, verbose);
8702  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8703 }
8704 
8705 // Globals and functions for hidden helper task
8706 kmp_info_t **__kmp_hidden_helper_threads;
8707 kmp_info_t *__kmp_hidden_helper_main_thread;
8708 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8709 #if KMP_OS_LINUX
8710 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8711 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8712 #else
8713 kmp_int32 __kmp_hidden_helper_threads_num = 0;
8714 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8715 #endif
8716 
8717 namespace {
8718 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8719 
8720 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8721  // This is an explicit synchronization on all hidden helper threads in case
8722  // a regular thread pushes a hidden helper task to a hidden helper thread
8723  // that has not been awakened even once since the threads were released by
8724  // the main thread after creating the team.
8725  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8726  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8727  __kmp_hidden_helper_threads_num)
8728  ;
8729 
8730  // If main thread, then wait for signal
8731  if (__kmpc_master(nullptr, *gtid)) {
8732  // First, unset the initial state and release the initial thread
8733  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8734  __kmp_hidden_helper_initz_release();
8735  __kmp_hidden_helper_main_thread_wait();
8736  // Now wake up all worker threads
8737  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8738  __kmp_hidden_helper_worker_thread_signal();
8739  }
8740  }
8741 }
8742 } // namespace
8743 
8744 void __kmp_hidden_helper_threads_initz_routine() {
8745  // Create a new root for hidden helper team/threads
8746  const int gtid = __kmp_register_root(TRUE);
8747  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8748  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8749  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8750  __kmp_hidden_helper_threads_num;
8751 
8752  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8753 
8754  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8755 
8756  // Set the initialization flag to FALSE
8757  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8758 
8759  __kmp_hidden_helper_threads_deinitz_release();
8760 }
8761 
8762 /* Nesting Mode:
8763  Set via KMP_NESTING_MODE, which takes an integer.
8764  Note: we skip duplicate topology levels, and skip levels with only
8765  one entity.
8766  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
8767  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
8768  in the topology, and initializes the number of threads at each of those
8769  levels to the number of entities at each level, respectively, below the
8770  entity at the parent level.
8771  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
8772  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
8773  the user to turn nesting on explicitly. This is an even more experimental
8774  option to this experimental feature, and may change or go away in the
8775  future.
8776 */
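// Illustrative example (hypothetical topology): on a machine with 2 sockets,
// 12 cores per socket, and 2 hardware threads per core, KMP_NESTING_MODE=1
// creates three nesting levels initialized to 2, 12, and 2 threads
// respectively; on a single-socket machine the socket level has only one
// entity and is skipped.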
8777 
8778 // Allocate space to store nesting levels
8779 void __kmp_init_nesting_mode() {
8780  int levels = KMP_HW_LAST;
8781  __kmp_nesting_mode_nlevels = levels;
8782  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
8783  for (int i = 0; i < levels; ++i)
8784  __kmp_nesting_nth_level[i] = 0;
8785  if (__kmp_nested_nth.size < levels) {
8786  __kmp_nested_nth.nth =
8787  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
8788  __kmp_nested_nth.size = levels;
8789  }
8790 }
8791 
8792 // Set # threads for top levels of nesting; must be called after topology set
8793 void __kmp_set_nesting_mode_threads() {
8794  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
8795 
8796  if (__kmp_nesting_mode == 1)
8797  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
8798  else if (__kmp_nesting_mode > 1)
8799  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8800 
8801  if (__kmp_topology) { // use topology info
8802  int loc, hw_level;
8803  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
8804  loc < __kmp_nesting_mode_nlevels;
8805  loc++, hw_level++) {
8806  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
8807  if (__kmp_nesting_nth_level[loc] == 1)
8808  loc--;
8809  }
8810  // Make sure all cores are used
8811  if (__kmp_nesting_mode > 1 && loc > 1) {
8812  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
8813  int num_cores = __kmp_topology->get_count(core_level);
8814  int upper_levels = 1;
8815  for (int level = 0; level < loc - 1; ++level)
8816  upper_levels *= __kmp_nesting_nth_level[level];
8817  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
8818  __kmp_nesting_nth_level[loc - 1] =
8819  num_cores / __kmp_nesting_nth_level[loc - 2];
8820  }
8821  __kmp_nesting_mode_nlevels = loc;
8822  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8823  } else { // no topology info available; provide a reasonable guesstimation
8824  if (__kmp_avail_proc >= 4) {
8825  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
8826  __kmp_nesting_nth_level[1] = 2;
8827  __kmp_nesting_mode_nlevels = 2;
8828  } else {
8829  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
8830  __kmp_nesting_mode_nlevels = 1;
8831  }
8832  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
8833  }
8834  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
8835  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
8836  }
8837  set__nproc(thread, __kmp_nesting_nth_level[0]);
8838  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
8839  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
8840  if (get__max_active_levels(thread) > 1) {
8841  // if max levels was set, set nesting mode levels to same
8842  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
8843  }
8844  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
8845  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
8846 }