LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #if KMP_MIC && USE_NGO_STORES
26 // ICV copying
27 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
28 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
29 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
31 #else
32 #define ngo_load(src) ((void)0)
33 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
34 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
35 #define ngo_sync() ((void)0)
36 #endif /* KMP_MIC && USE_NGO_STORES */
37 
38 void __kmp_print_structure(void); // Forward declaration
39 
40 // ---------------------------- Barrier Algorithms ----------------------------
41 
42 // Linear Barrier
43 template <bool cancellable = false>
44 static bool __kmp_linear_barrier_gather_template(
45  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
47  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
48  kmp_team_t *team = this_thr->th.th_team;
49  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
50  kmp_info_t **other_threads = team->t.t_threads;
51 
52  KA_TRACE(
53  20,
54  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55  gtid, team->t.t_id, tid, bt));
56  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57 
58 #if USE_ITT_BUILD && USE_ITT_NOTIFY
59  // Barrier imbalance - save arrive time to the thread
60  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
62  __itt_get_timestamp();
63  }
64 #endif
65  // We now perform a linear reduction to signal that all of the threads have
66  // arrived.
67  if (!KMP_MASTER_TID(tid)) {
68  KA_TRACE(20,
69  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
70  "arrived(%p): %llu => %llu\n",
71  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
72  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
73  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
74  // Mark arrival to primary thread
75  /* After performing this write, a worker thread may not assume that the team
76  is valid any more - it could be deallocated by the primary thread at any
77  time. */
78  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
79  flag.release();
80  } else {
81  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
82  int nproc = this_thr->th.th_team_nproc;
83  int i;
84  // Don't have to worry about sleep bit here or atomic since team setting
85  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
86 
87  // Collect all the worker team member threads.
88  for (i = 1; i < nproc; ++i) {
89 #if KMP_CACHE_MANAGE
90  // Prefetch next thread's arrived count
91  if (i + 1 < nproc)
92  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
93 #endif /* KMP_CACHE_MANAGE */
94  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
95  "arrived(%p) == %llu\n",
96  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
97  team->t.t_id, i,
98  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
99 
100  // Wait for worker thread to arrive
101  if (cancellable) {
102  kmp_flag_64<true, false> flag(
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
104  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
105  return true;
106  } else {
107  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
108  new_state);
109  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
110  }
111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
112  // Barrier imbalance - write min of the thread time and the other thread
113  // time to the thread.
114  if (__kmp_forkjoin_frames_mode == 2) {
115  this_thr->th.th_bar_min_time = KMP_MIN(
116  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
117  }
118 #endif
119  if (reduce) {
120  KA_TRACE(100,
121  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
122  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
123  team->t.t_id, i));
124  OMPT_REDUCTION_DECL(this_thr, gtid);
125  OMPT_REDUCTION_BEGIN;
126  (*reduce)(this_thr->th.th_local.reduce_data,
127  other_threads[i]->th.th_local.reduce_data);
128  OMPT_REDUCTION_END;
129  }
130  }
131  // Don't have to worry about sleep bit here or atomic since team setting
132  team_bar->b_arrived = new_state;
133  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
134  "arrived(%p) = %llu\n",
135  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
136  new_state));
137  }
138  KA_TRACE(
139  20,
140  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
141  gtid, team->t.t_id, tid, bt));
142  return false;
143 }
144 
145 template <bool cancellable = false>
146 static bool __kmp_linear_barrier_release_template(
147  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
148  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
149  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
150  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
151  kmp_team_t *team;
152 
153  if (KMP_MASTER_TID(tid)) {
154  unsigned int i;
155  kmp_uint32 nproc = this_thr->th.th_team_nproc;
156  kmp_info_t **other_threads;
157 
158  team = __kmp_threads[gtid]->th.th_team;
159  KMP_DEBUG_ASSERT(team != NULL);
160  other_threads = team->t.t_threads;
161 
162  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
163  "barrier type %d\n",
164  gtid, team->t.t_id, tid, bt));
165 
166  if (nproc > 1) {
167 #if KMP_BARRIER_ICV_PUSH
168  {
169  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
170  if (propagate_icvs) {
171  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
172  for (i = 1; i < nproc; ++i) {
173  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
174  team, i, FALSE);
175  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
176  &team->t.t_implicit_task_taskdata[0].td_icvs);
177  }
178  ngo_sync();
179  }
180  }
181 #endif // KMP_BARRIER_ICV_PUSH
182 
183  // Now, release all of the worker threads
184  for (i = 1; i < nproc; ++i) {
185 #if KMP_CACHE_MANAGE
186  // Prefetch next thread's go flag
187  if (i + 1 < nproc)
188  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
189 #endif /* KMP_CACHE_MANAGE */
190  KA_TRACE(
191  20,
192  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
193  "go(%p): %u => %u\n",
194  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
195  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
196  other_threads[i]->th.th_bar[bt].bb.b_go,
197  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
198  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
199  other_threads[i]);
200  flag.release();
201  }
202  }
203  } else { // Wait for the PRIMARY thread to release us
204  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
205  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
206  if (cancellable) {
207  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
208  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
209  return true;
210  } else {
211  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
212  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
213  }
214 #if USE_ITT_BUILD && USE_ITT_NOTIFY
215  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
216  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
217  // disabled)
218  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
219  // Cancel wait on previous parallel region...
220  __kmp_itt_task_starting(itt_sync_obj);
221 
222  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
223  return false;
224 
225  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
226  if (itt_sync_obj != NULL)
227  // Call prepare as early as possible for "new" barrier
228  __kmp_itt_task_finished(itt_sync_obj);
229  } else
230 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
231  // Early exit for reaping threads releasing forkjoin barrier
232  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
233  return false;
234 // The worker thread may now assume that the team is valid.
235 #ifdef KMP_DEBUG
236  tid = __kmp_tid_from_gtid(gtid);
237  team = __kmp_threads[gtid]->th.th_team;
238 #endif
239  KMP_DEBUG_ASSERT(team != NULL);
240  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
241  KA_TRACE(20,
242  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
243  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
244  KMP_MB(); // Flush all pending memory write invalidates.
245  }
246  KA_TRACE(
247  20,
248  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
249  gtid, team->t.t_id, tid, bt));
250  return false;
251 }
252 
253 static void __kmp_linear_barrier_gather(
254  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
255  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
256  __kmp_linear_barrier_gather_template<false>(
257  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
258 }
259 
260 static bool __kmp_linear_barrier_gather_cancellable(
261  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
262  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
263  return __kmp_linear_barrier_gather_template<true>(
264  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
265 }
266 
267 static void __kmp_linear_barrier_release(
268  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
269  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
270  __kmp_linear_barrier_release_template<false>(
271  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
272 }
273 
274 static bool __kmp_linear_barrier_release_cancellable(
275  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
276  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
277  return __kmp_linear_barrier_release_template<true>(
278  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
279 }
280 
281 // Tree barrier
282 static void __kmp_tree_barrier_gather(
283  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
284  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
285  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
286  kmp_team_t *team = this_thr->th.th_team;
287  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
288  kmp_info_t **other_threads = team->t.t_threads;
289  kmp_uint32 nproc = this_thr->th.th_team_nproc;
290  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
291  kmp_uint32 branch_factor = 1 << branch_bits;
292  kmp_uint32 child;
293  kmp_uint32 child_tid;
294  kmp_uint64 new_state = 0;
295 
296  KA_TRACE(
297  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
298  gtid, team->t.t_id, tid, bt));
299  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
300 
301 #if USE_ITT_BUILD && USE_ITT_NOTIFY
302  // Barrier imbalance - save arrive time to the thread
303  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
304  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
305  __itt_get_timestamp();
306  }
307 #endif
308  // Perform tree gather to wait until all threads have arrived; reduce any
309  // required data as we go
310  child_tid = (tid << branch_bits) + 1;
311  if (child_tid < nproc) {
312  // Parent threads wait for all their children to arrive
313  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
314  child = 1;
315  do {
316  kmp_info_t *child_thr = other_threads[child_tid];
317  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
318 #if KMP_CACHE_MANAGE
319  // Prefetch next thread's arrived count
320  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
321  KMP_CACHE_PREFETCH(
322  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
323 #endif /* KMP_CACHE_MANAGE */
324  KA_TRACE(20,
325  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
326  "arrived(%p) == %llu\n",
327  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
328  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
329  // Wait for child to arrive
330  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
331  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
332 #if USE_ITT_BUILD && USE_ITT_NOTIFY
333  // Barrier imbalance - write min of the thread time and a child time to
334  // the thread.
335  if (__kmp_forkjoin_frames_mode == 2) {
336  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
337  child_thr->th.th_bar_min_time);
338  }
339 #endif
340  if (reduce) {
341  KA_TRACE(100,
342  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
343  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
344  team->t.t_id, child_tid));
345  OMPT_REDUCTION_DECL(this_thr, gtid);
346  OMPT_REDUCTION_BEGIN;
347  (*reduce)(this_thr->th.th_local.reduce_data,
348  child_thr->th.th_local.reduce_data);
349  OMPT_REDUCTION_END;
350  }
351  child++;
352  child_tid++;
353  } while (child <= branch_factor && child_tid < nproc);
354  }
355 
356  if (!KMP_MASTER_TID(tid)) { // Worker threads
357  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
358 
359  KA_TRACE(20,
360  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
361  "arrived(%p): %llu => %llu\n",
362  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
363  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
364  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
365 
366  // Mark arrival to parent thread
367  /* After performing this write, a worker thread may not assume that the team
368  is valid any more - it could be deallocated by the primary thread at any
369  time. */
370  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
371  flag.release();
372  } else {
373  // Need to update the team arrived pointer if we are the primary thread
374  if (nproc > 1) // New value was already computed above
375  team->t.t_bar[bt].b_arrived = new_state;
376  else
377  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
378  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
379  "arrived(%p) = %llu\n",
380  gtid, team->t.t_id, tid, team->t.t_id,
381  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
382  }
383  KA_TRACE(20,
384  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
385  gtid, team->t.t_id, tid, bt));
386 }
387 
388 static void __kmp_tree_barrier_release(
389  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
390  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
391  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
392  kmp_team_t *team;
393  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
394  kmp_uint32 nproc;
395  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
396  kmp_uint32 branch_factor = 1 << branch_bits;
397  kmp_uint32 child;
398  kmp_uint32 child_tid;
399 
400  // Perform a tree release for all of the threads that have been gathered
401  if (!KMP_MASTER_TID(
402  tid)) { // Handle fork barrier workers who aren't part of a team yet
403  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
404  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
405  // Wait for parent thread to release us
406  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
407  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
408 #if USE_ITT_BUILD && USE_ITT_NOTIFY
409  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
410  // In fork barrier where we could not get the object reliably (or
411  // ITTNOTIFY is disabled)
412  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
413  // Cancel wait on previous parallel region...
414  __kmp_itt_task_starting(itt_sync_obj);
415 
416  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
417  return;
418 
419  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
420  if (itt_sync_obj != NULL)
421  // Call prepare as early as possible for "new" barrier
422  __kmp_itt_task_finished(itt_sync_obj);
423  } else
424 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
425  // Early exit for reaping threads releasing forkjoin barrier
426  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
427  return;
428 
429  // The worker thread may now assume that the team is valid.
430  team = __kmp_threads[gtid]->th.th_team;
431  KMP_DEBUG_ASSERT(team != NULL);
432  tid = __kmp_tid_from_gtid(gtid);
433 
434  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
435  KA_TRACE(20,
436  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
437  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
438  KMP_MB(); // Flush all pending memory write invalidates.
439  } else {
440  team = __kmp_threads[gtid]->th.th_team;
441  KMP_DEBUG_ASSERT(team != NULL);
442  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
443  "barrier type %d\n",
444  gtid, team->t.t_id, tid, bt));
445  }
446  nproc = this_thr->th.th_team_nproc;
447  child_tid = (tid << branch_bits) + 1;
448 
449  if (child_tid < nproc) {
450  kmp_info_t **other_threads = team->t.t_threads;
451  child = 1;
452  // Parent threads release all their children
453  do {
454  kmp_info_t *child_thr = other_threads[child_tid];
455  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
456 #if KMP_CACHE_MANAGE
457  // Prefetch next thread's go count
458  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
459  KMP_CACHE_PREFETCH(
460  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
461 #endif /* KMP_CACHE_MANAGE */
462 
463 #if KMP_BARRIER_ICV_PUSH
464  {
465  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
466  if (propagate_icvs) {
467  __kmp_init_implicit_task(team->t.t_ident,
468  team->t.t_threads[child_tid], team,
469  child_tid, FALSE);
470  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
471  &team->t.t_implicit_task_taskdata[0].td_icvs);
472  }
473  }
474 #endif // KMP_BARRIER_ICV_PUSH
475  KA_TRACE(20,
476  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
477  "go(%p): %u => %u\n",
478  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
479  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
480  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
481  // Release child from barrier
482  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
483  flag.release();
484  child++;
485  child_tid++;
486  } while (child <= branch_factor && child_tid < nproc);
487  }
488  KA_TRACE(
489  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
490  gtid, team->t.t_id, tid, bt));
491 }
492 
493 // Hyper Barrier
494 static void __kmp_hyper_barrier_gather(
495  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
496  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
497  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
498  kmp_team_t *team = this_thr->th.th_team;
499  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
500  kmp_info_t **other_threads = team->t.t_threads;
501  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
502  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
503  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
504  kmp_uint32 branch_factor = 1 << branch_bits;
505  kmp_uint32 offset;
506  kmp_uint32 level;
507 
508  KA_TRACE(
509  20,
510  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
511  gtid, team->t.t_id, tid, bt));
512  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
513 
514 #if USE_ITT_BUILD && USE_ITT_NOTIFY
515  // Barrier imbalance - save arrive time to the thread
516  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
517  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
518  __itt_get_timestamp();
519  }
520 #endif
521  /* Perform a hypercube-embedded tree gather to wait until all of the threads
522  have arrived, and reduce any required data as we go. */
523  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
524  for (level = 0, offset = 1; offset < num_threads;
525  level += branch_bits, offset <<= branch_bits) {
526  kmp_uint32 child;
527  kmp_uint32 child_tid;
528 
529  if (((tid >> level) & (branch_factor - 1)) != 0) {
530  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
531 
532  KMP_MB(); // Synchronize parent and child threads.
533  KA_TRACE(20,
534  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
535  "arrived(%p): %llu => %llu\n",
536  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
537  team->t.t_id, parent_tid, &thr_bar->b_arrived,
538  thr_bar->b_arrived,
539  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
540  // Mark arrival to parent thread
541  /* After performing this write (in the last iteration of the enclosing for
542  loop), a worker thread may not assume that the team is valid any more
543  - it could be deallocated by the primary thread at any time. */
544  p_flag.set_waiter(other_threads[parent_tid]);
545  p_flag.release();
546  break;
547  }
548 
549  // Parent threads wait for children to arrive
550  if (new_state == KMP_BARRIER_UNUSED_STATE)
551  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
552  for (child = 1, child_tid = tid + (1 << level);
553  child < branch_factor && child_tid < num_threads;
554  child++, child_tid += (1 << level)) {
555  kmp_info_t *child_thr = other_threads[child_tid];
556  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
557 #if KMP_CACHE_MANAGE
558  kmp_uint32 next_child_tid = child_tid + (1 << level);
559  // Prefetch next thread's arrived count
560  if (child + 1 < branch_factor && next_child_tid < num_threads)
561  KMP_CACHE_PREFETCH(
562  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
563 #endif /* KMP_CACHE_MANAGE */
564  KA_TRACE(20,
565  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
566  "arrived(%p) == %llu\n",
567  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
568  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
569  // Wait for child to arrive
570  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
571  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
572  KMP_MB(); // Synchronize parent and child threads.
573 #if USE_ITT_BUILD && USE_ITT_NOTIFY
574  // Barrier imbalance - write min of the thread time and a child time to
575  // the thread.
576  if (__kmp_forkjoin_frames_mode == 2) {
577  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
578  child_thr->th.th_bar_min_time);
579  }
580 #endif
581  if (reduce) {
582  KA_TRACE(100,
583  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
584  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
585  team->t.t_id, child_tid));
586  OMPT_REDUCTION_DECL(this_thr, gtid);
587  OMPT_REDUCTION_BEGIN;
588  (*reduce)(this_thr->th.th_local.reduce_data,
589  child_thr->th.th_local.reduce_data);
590  OMPT_REDUCTION_END;
591  }
592  }
593  }
594 
595  if (KMP_MASTER_TID(tid)) {
596  // Need to update the team arrived pointer if we are the primary thread
597  if (new_state == KMP_BARRIER_UNUSED_STATE)
598  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
599  else
600  team->t.t_bar[bt].b_arrived = new_state;
601  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
602  "arrived(%p) = %llu\n",
603  gtid, team->t.t_id, tid, team->t.t_id,
604  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
605  }
606  KA_TRACE(
607  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
608  gtid, team->t.t_id, tid, bt));
609 }
610 
611 // The reverse versions seem to beat the forward versions overall
612 #define KMP_REVERSE_HYPER_BAR
613 static void __kmp_hyper_barrier_release(
614  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
615  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
616  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
617  kmp_team_t *team;
618  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
619  kmp_info_t **other_threads;
620  kmp_uint32 num_threads;
621  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
622  kmp_uint32 branch_factor = 1 << branch_bits;
623  kmp_uint32 child;
624  kmp_uint32 child_tid;
625  kmp_uint32 offset;
626  kmp_uint32 level;
627 
628  /* Perform a hypercube-embedded tree release for all of the threads that have
629  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
630  are released in the reverse order of the corresponding gather, otherwise
631  threads are released in the same order. */
632  if (KMP_MASTER_TID(tid)) { // primary thread
633  team = __kmp_threads[gtid]->th.th_team;
634  KMP_DEBUG_ASSERT(team != NULL);
635  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
636  "barrier type %d\n",
637  gtid, team->t.t_id, tid, bt));
638 #if KMP_BARRIER_ICV_PUSH
639  if (propagate_icvs) { // primary already has ICVs in final destination; copy
640  copy_icvs(&thr_bar->th_fixed_icvs,
641  &team->t.t_implicit_task_taskdata[tid].td_icvs);
642  }
643 #endif
644  } else { // Handle fork barrier workers who aren't part of a team yet
645  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
646  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
647  // Wait for parent thread to release us
648  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
649  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
650 #if USE_ITT_BUILD && USE_ITT_NOTIFY
651  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
652  // In fork barrier where we could not get the object reliably
653  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
654  // Cancel wait on previous parallel region...
655  __kmp_itt_task_starting(itt_sync_obj);
656 
657  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
658  return;
659 
660  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
661  if (itt_sync_obj != NULL)
662  // Call prepare as early as possible for "new" barrier
663  __kmp_itt_task_finished(itt_sync_obj);
664  } else
665 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
666  // Early exit for reaping threads releasing forkjoin barrier
667  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
668  return;
669 
670  // The worker thread may now assume that the team is valid.
671  team = __kmp_threads[gtid]->th.th_team;
672  KMP_DEBUG_ASSERT(team != NULL);
673  tid = __kmp_tid_from_gtid(gtid);
674 
675  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
676  KA_TRACE(20,
677  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
678  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
679  KMP_MB(); // Flush all pending memory write invalidates.
680  }
681  num_threads = this_thr->th.th_team_nproc;
682  other_threads = team->t.t_threads;
683 
684 #ifdef KMP_REVERSE_HYPER_BAR
685  // Count up to correct level for parent
686  for (level = 0, offset = 1;
687  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
688  level += branch_bits, offset <<= branch_bits)
689  ;
690 
691  // Now go down from there
692  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
693  level -= branch_bits, offset >>= branch_bits)
694 #else
695  // Go down the tree, level by level
696  for (level = 0, offset = 1; offset < num_threads;
697  level += branch_bits, offset <<= branch_bits)
698 #endif // KMP_REVERSE_HYPER_BAR
699  {
700 #ifdef KMP_REVERSE_HYPER_BAR
701  /* Now go in reverse order through the children, highest to lowest.
702  Initial setting of child is conservative here. */
703  child = num_threads >> ((level == 0) ? level : level - 1);
704  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
705  child_tid = tid + (child << level);
706  child >= 1; child--, child_tid -= (1 << level))
707 #else
708  if (((tid >> level) & (branch_factor - 1)) != 0)
709  // No need to go lower than this, since this is the level parent would be
710  // notified
711  break;
712  // Iterate through children on this level of the tree
713  for (child = 1, child_tid = tid + (1 << level);
714  child < branch_factor && child_tid < num_threads;
715  child++, child_tid += (1 << level))
716 #endif // KMP_REVERSE_HYPER_BAR
717  {
718  if (child_tid >= num_threads)
719  continue; // Child doesn't exist so keep going
720  else {
721  kmp_info_t *child_thr = other_threads[child_tid];
722  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
723 #if KMP_CACHE_MANAGE
724  kmp_uint32 next_child_tid = child_tid - (1 << level);
725 // Prefetch next thread's go count
726 #ifdef KMP_REVERSE_HYPER_BAR
727  if (child - 1 >= 1 && next_child_tid < num_threads)
728 #else
729  if (child + 1 < branch_factor && next_child_tid < num_threads)
730 #endif // KMP_REVERSE_HYPER_BAR
731  KMP_CACHE_PREFETCH(
732  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
733 #endif /* KMP_CACHE_MANAGE */
734 
735 #if KMP_BARRIER_ICV_PUSH
736  if (propagate_icvs) // push my fixed ICVs to my child
737  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
738 #endif // KMP_BARRIER_ICV_PUSH
739 
740  KA_TRACE(
741  20,
742  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
743  "go(%p): %u => %u\n",
744  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
745  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
746  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
747  // Release child from barrier
748  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
749  flag.release();
750  }
751  }
752  }
753 #if KMP_BARRIER_ICV_PUSH
754  if (propagate_icvs &&
755  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
756  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
757  FALSE);
758  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
759  &thr_bar->th_fixed_icvs);
760  }
761 #endif
762  KA_TRACE(
763  20,
764  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
765  gtid, team->t.t_id, tid, bt));
766 }
767 
768 // Hierarchical Barrier
769 
770 // Initialize thread barrier data
771 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
772  Performs the minimum amount of initialization required based on how the team
773  has changed. Returns true if leaf children will require both on-core and
774  traditional wake-up mechanisms. For example, if the team size increases,
775  threads already in the team will respond to on-core wakeup on their parent
776  thread, but threads newly added to the team will only be listening on the
777  their local b_go. */
778 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
779  kmp_bstate_t *thr_bar,
780  kmp_uint32 nproc, int gtid,
781  int tid, kmp_team_t *team) {
782  // Checks to determine if (re-)initialization is needed
783  bool uninitialized = thr_bar->team == NULL;
784  bool team_changed = team != thr_bar->team;
785  bool team_sz_changed = nproc != thr_bar->nproc;
786  bool tid_changed = tid != thr_bar->old_tid;
787  bool retval = false;
788 
789  if (uninitialized || team_sz_changed) {
790  __kmp_get_hierarchy(nproc, thr_bar);
791  }
792 
793  if (uninitialized || team_sz_changed || tid_changed) {
794  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
795  thr_bar->parent_tid = -1; // default for primary thread
796  if (!KMP_MASTER_TID(tid)) {
797  // if not primary thread, find parent thread in hierarchy
798  kmp_uint32 d = 0;
799  while (d < thr_bar->depth) { // find parent based on level of thread in
800  // hierarchy, and note level
801  kmp_uint32 rem;
802  if (d == thr_bar->depth - 2) { // reached level right below the primary
803  thr_bar->parent_tid = 0;
804  thr_bar->my_level = d;
805  break;
806  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
807  // TODO: can we make the above op faster?
808  // thread is not a subtree root at next level, so this is max
809  thr_bar->parent_tid = tid - rem;
810  thr_bar->my_level = d;
811  break;
812  }
813  ++d;
814  }
815  }
816  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
817  (thr_bar->skip_per_level[thr_bar->my_level])),
818  &(thr_bar->offset));
819  thr_bar->old_tid = tid;
820  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
821  thr_bar->team = team;
822  thr_bar->parent_bar =
823  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
824  }
825  if (uninitialized || team_changed || tid_changed) {
826  thr_bar->team = team;
827  thr_bar->parent_bar =
828  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
829  retval = true;
830  }
831  if (uninitialized || team_sz_changed || tid_changed) {
832  thr_bar->nproc = nproc;
833  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
834  if (thr_bar->my_level == 0)
835  thr_bar->leaf_kids = 0;
836  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
837  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
838  thr_bar->leaf_state = 0;
839  for (int i = 0; i < thr_bar->leaf_kids; ++i)
840  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
841  }
842  return retval;
843 }
844 
845 static void __kmp_hierarchical_barrier_gather(
846  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
847  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
848  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
849  kmp_team_t *team = this_thr->th.th_team;
850  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
851  kmp_uint32 nproc = this_thr->th.th_team_nproc;
852  kmp_info_t **other_threads = team->t.t_threads;
853  kmp_uint64 new_state = 0;
854 
855  int level = team->t.t_level;
856  if (other_threads[0]
857  ->th.th_teams_microtask) // are we inside the teams construct?
858  if (this_thr->th.th_teams_size.nteams > 1)
859  ++level; // level was not increased in teams construct for team_of_masters
860  if (level == 1)
861  thr_bar->use_oncore_barrier = 1;
862  else
863  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
864 
865  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
866  "barrier type %d\n",
867  gtid, team->t.t_id, tid, bt));
868  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
869 
870 #if USE_ITT_BUILD && USE_ITT_NOTIFY
871  // Barrier imbalance - save arrive time to the thread
872  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
873  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
874  }
875 #endif
876 
877  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
878  team);
879 
880  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
881  kmp_int32 child_tid;
882  new_state =
883  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
884  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
885  thr_bar->use_oncore_barrier) {
886  if (thr_bar->leaf_kids) {
887  // First, wait for leaf children to check-in on my b_arrived flag
888  kmp_uint64 leaf_state =
889  KMP_MASTER_TID(tid)
890  ? thr_bar->b_arrived | thr_bar->leaf_state
891  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
892  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
893  "for leaf kids\n",
894  gtid, team->t.t_id, tid));
895  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
896  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
897  if (reduce) {
898  OMPT_REDUCTION_DECL(this_thr, gtid);
899  OMPT_REDUCTION_BEGIN;
900  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
901  ++child_tid) {
902  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
903  "T#%d(%d:%d)\n",
904  gtid, team->t.t_id, tid,
905  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
906  child_tid));
907  (*reduce)(this_thr->th.th_local.reduce_data,
908  other_threads[child_tid]->th.th_local.reduce_data);
909  }
910  OMPT_REDUCTION_END;
911  }
912  // clear leaf_state bits
913  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
914  }
915  // Next, wait for higher level children on each child's b_arrived flag
916  for (kmp_uint32 d = 1; d < thr_bar->my_level;
917  ++d) { // gather lowest level threads first, but skip 0
918  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
919  skip = thr_bar->skip_per_level[d];
920  if (last > nproc)
921  last = nproc;
922  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
923  kmp_info_t *child_thr = other_threads[child_tid];
924  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
925  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
926  "T#%d(%d:%d) "
927  "arrived(%p) == %llu\n",
928  gtid, team->t.t_id, tid,
929  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930  child_tid, &child_bar->b_arrived, new_state));
931  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
932  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
933  if (reduce) {
934  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
935  "T#%d(%d:%d)\n",
936  gtid, team->t.t_id, tid,
937  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
938  child_tid));
939  (*reduce)(this_thr->th.th_local.reduce_data,
940  child_thr->th.th_local.reduce_data);
941  }
942  }
943  }
944  } else { // Blocktime is not infinite
945  for (kmp_uint32 d = 0; d < thr_bar->my_level;
946  ++d) { // Gather lowest level threads first
947  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948  skip = thr_bar->skip_per_level[d];
949  if (last > nproc)
950  last = nproc;
951  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952  kmp_info_t *child_thr = other_threads[child_tid];
953  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955  "T#%d(%d:%d) "
956  "arrived(%p) == %llu\n",
957  gtid, team->t.t_id, tid,
958  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959  child_tid, &child_bar->b_arrived, new_state));
960  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962  if (reduce) {
963  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
964  "T#%d(%d:%d)\n",
965  gtid, team->t.t_id, tid,
966  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
967  child_tid));
968  (*reduce)(this_thr->th.th_local.reduce_data,
969  child_thr->th.th_local.reduce_data);
970  }
971  }
972  }
973  }
974  }
975  // All subordinates are gathered; now release parent if not primary thread
976 
977  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
978  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
979  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
980  gtid, team->t.t_id, tid,
981  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
982  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
983  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
984  /* Mark arrival to parent: After performing this write, a worker thread may
985  not assume that the team is valid any more - it could be deallocated by
986  the primary thread at any time. */
987  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
988  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
989  // flag; release it
990  kmp_flag_64<> flag(&thr_bar->b_arrived,
991  other_threads[thr_bar->parent_tid]);
992  flag.release();
993  } else {
994  // Leaf does special release on "offset" bits of parent's b_arrived flag
995  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
996  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
997  thr_bar->offset + 1);
998  flag.set_waiter(other_threads[thr_bar->parent_tid]);
999  flag.release();
1000  }
1001  } else { // Primary thread needs to update the team's b_arrived value
1002  team->t.t_bar[bt].b_arrived = new_state;
1003  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1004  "arrived(%p) = %llu\n",
1005  gtid, team->t.t_id, tid, team->t.t_id,
1006  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1007  }
1008  // Is the team access below unsafe or just technically invalid?
1009  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1010  "barrier type %d\n",
1011  gtid, team->t.t_id, tid, bt));
1012 }
1013 
1014 static void __kmp_hierarchical_barrier_release(
1015  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1016  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1017  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1018  kmp_team_t *team;
1019  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1020  kmp_uint32 nproc;
1021  bool team_change = false; // indicates on-core barrier shouldn't be used
1022 
1023  if (KMP_MASTER_TID(tid)) {
1024  team = __kmp_threads[gtid]->th.th_team;
1025  KMP_DEBUG_ASSERT(team != NULL);
1026  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1027  "entered barrier type %d\n",
1028  gtid, team->t.t_id, tid, bt));
1029  } else { // Worker threads
1030  // Wait for parent thread to release me
1031  if (!thr_bar->use_oncore_barrier ||
1032  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1033  thr_bar->team == NULL) {
1034  // Use traditional method of waiting on my own b_go flag
1035  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1036  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1037  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1038  TCW_8(thr_bar->b_go,
1039  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1040  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1041  // infinite, not nested
1042  // Wait on my "offset" bits on parent's b_go flag
1043  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1044  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1045  thr_bar->offset + 1, bt,
1046  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1047  flag.wait(this_thr, TRUE);
1048  if (thr_bar->wait_flag ==
1049  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1050  TCW_8(thr_bar->b_go,
1051  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1052  } else { // Reset my bits on parent's b_go flag
1053  (RCAST(volatile char *,
1054  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1055  }
1056  }
1057  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1058  // Early exit for reaping threads releasing forkjoin barrier
1059  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1060  return;
1061  // The worker thread may now assume that the team is valid.
1062  team = __kmp_threads[gtid]->th.th_team;
1063  KMP_DEBUG_ASSERT(team != NULL);
1064  tid = __kmp_tid_from_gtid(gtid);
1065 
1066  KA_TRACE(
1067  20,
1068  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1069  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1070  KMP_MB(); // Flush all pending memory write invalidates.
1071  }
1072 
1073  nproc = this_thr->th.th_team_nproc;
1074  int level = team->t.t_level;
1075  if (team->t.t_threads[0]
1076  ->th.th_teams_microtask) { // are we inside the teams construct?
1077  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1078  this_thr->th.th_teams_level == level)
1079  ++level; // level was not increased in teams construct for team_of_workers
1080  if (this_thr->th.th_teams_size.nteams > 1)
1081  ++level; // level was not increased in teams construct for team_of_masters
1082  }
1083  if (level == 1)
1084  thr_bar->use_oncore_barrier = 1;
1085  else
1086  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1087 
1088  // If the team size has increased, we still communicate with old leaves via
1089  // oncore barrier.
1090  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1091  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1092  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1093  tid, team);
1094  // But if the entire team changes, we won't use oncore barrier at all
1095  if (team_change)
1096  old_leaf_kids = 0;
1097 
1098 #if KMP_BARRIER_ICV_PUSH
1099  if (propagate_icvs) {
1100  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1101  FALSE);
1102  if (KMP_MASTER_TID(
1103  tid)) { // primary already has copy in final destination; copy
1104  copy_icvs(&thr_bar->th_fixed_icvs,
1105  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1106  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1107  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1108  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1109  // leaves (on-core children) pull parent's fixed ICVs directly to local
1110  // ICV store
1111  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1112  &thr_bar->parent_bar->th_fixed_icvs);
1113  // non-leaves will get ICVs piggybacked with b_go via NGO store
1114  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1115  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1116  // access
1117  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1118  else // leaves copy parent's fixed ICVs directly to local ICV store
1119  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1120  &thr_bar->parent_bar->th_fixed_icvs);
1121  }
1122  }
1123 #endif // KMP_BARRIER_ICV_PUSH
1124 
1125  // Now, release my children
1126  if (thr_bar->my_level) { // not a leaf
1127  kmp_int32 child_tid;
1128  kmp_uint32 last;
1129  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1130  thr_bar->use_oncore_barrier) {
1131  if (KMP_MASTER_TID(tid)) { // do a flat release
1132  // Set local b_go to bump children via NGO store of the cache line
1133  // containing IVCs and b_go.
1134  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1135  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1136  // the cache line
1137  ngo_load(&thr_bar->th_fixed_icvs);
1138  // This loops over all the threads skipping only the leaf nodes in the
1139  // hierarchy
1140  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1141  child_tid += thr_bar->skip_per_level[1]) {
1142  kmp_bstate_t *child_bar =
1143  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1144  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1145  "releasing T#%d(%d:%d)"
1146  " go(%p): %u => %u\n",
1147  gtid, team->t.t_id, tid,
1148  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1149  child_tid, &child_bar->b_go, child_bar->b_go,
1150  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1151  // Use ngo store (if available) to both store ICVs and release child
1152  // via child's b_go
1153  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1154  }
1155  ngo_sync();
1156  }
1157  TCW_8(thr_bar->b_go,
1158  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1159  // Now, release leaf children
1160  if (thr_bar->leaf_kids) { // if there are any
1161  // We test team_change on the off-chance that the level 1 team changed.
1162  if (team_change ||
1163  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1164  if (old_leaf_kids) { // release old leaf kids
1165  thr_bar->b_go |= old_leaf_state;
1166  }
1167  // Release new leaf kids
1168  last = tid + thr_bar->skip_per_level[1];
1169  if (last > nproc)
1170  last = nproc;
1171  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1172  ++child_tid) { // skip_per_level[0]=1
1173  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1174  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1175  KA_TRACE(
1176  20,
1177  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1178  " T#%d(%d:%d) go(%p): %u => %u\n",
1179  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1180  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1181  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1182  // Release child using child's b_go flag
1183  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1184  flag.release();
1185  }
1186  } else { // Release all children at once with leaf_state bits on my own
1187  // b_go flag
1188  thr_bar->b_go |= thr_bar->leaf_state;
1189  }
1190  }
1191  } else { // Blocktime is not infinite; do a simple hierarchical release
1192  for (int d = thr_bar->my_level - 1; d >= 0;
1193  --d) { // Release highest level threads first
1194  last = tid + thr_bar->skip_per_level[d + 1];
1195  kmp_uint32 skip = thr_bar->skip_per_level[d];
1196  if (last > nproc)
1197  last = nproc;
1198  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1199  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1200  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1201  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1202  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1203  gtid, team->t.t_id, tid,
1204  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1205  child_tid, &child_bar->b_go, child_bar->b_go,
1206  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1207  // Release child using child's b_go flag
1208  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1209  flag.release();
1210  }
1211  }
1212  }
1213 #if KMP_BARRIER_ICV_PUSH
1214  if (propagate_icvs && !KMP_MASTER_TID(tid))
1215  // non-leaves copy ICVs from fixed ICVs to local dest
1216  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1217  &thr_bar->th_fixed_icvs);
1218 #endif // KMP_BARRIER_ICV_PUSH
1219  }
1220  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1221  "barrier type %d\n",
1222  gtid, team->t.t_id, tid, bt));
1223 }
1224 
1225 // End of Barrier Algorithms
1226 
1227 // type traits for cancellable value
1228 // if cancellable is true, then is_cancellable is a normal boolean variable
1229 // if cancellable is false, then is_cancellable is a compile time constant
1230 template <bool cancellable> struct is_cancellable {};
1231 template <> struct is_cancellable<true> {
1232  bool value;
1233  is_cancellable() : value(false) {}
1234  is_cancellable(bool b) : value(b) {}
1235  is_cancellable &operator=(bool b) {
1236  value = b;
1237  return *this;
1238  }
1239  operator bool() const { return value; }
1240 };
1241 template <> struct is_cancellable<false> {
1242  is_cancellable &operator=(bool b) { return *this; }
1243  constexpr operator bool() const { return false; }
1244 };
1245 
1246 // Internal function to do a barrier.
1247 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1248  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1249  barrier
1250  When cancellable = false,
1251  Returns 0 if primary thread, 1 if worker thread.
1252  When cancellable = true
1253  Returns 0 if not cancelled, 1 if cancelled. */
1254 template <bool cancellable = false>
1255 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1256  size_t reduce_size, void *reduce_data,
1257  void (*reduce)(void *, void *)) {
1258  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1259  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1260  int tid = __kmp_tid_from_gtid(gtid);
1261  kmp_info_t *this_thr = __kmp_threads[gtid];
1262  kmp_team_t *team = this_thr->th.th_team;
1263  int status = 0;
1264  is_cancellable<cancellable> cancelled;
1265 #if OMPT_SUPPORT && OMPT_OPTIONAL
1266  ompt_data_t *my_task_data;
1267  ompt_data_t *my_parallel_data;
1268  void *return_address;
1269  ompt_sync_region_t barrier_kind;
1270 #endif
1271 
1272  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1273  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1274 
1275 #if OMPT_SUPPORT
1276  if (ompt_enabled.enabled) {
1277 #if OMPT_OPTIONAL
1278  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1279  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1280  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1281  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1282  if (ompt_enabled.ompt_callback_sync_region) {
1283  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1284  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1285  return_address);
1286  }
1287  if (ompt_enabled.ompt_callback_sync_region_wait) {
1288  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1289  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1290  return_address);
1291  }
1292 #endif
1293  // It is OK to report the barrier state after the barrier begin callback.
1294  // According to the OMPT specification, a compliant implementation may
1295  // even delay reporting this state until the barrier begins to wait.
1296  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1297  }
1298 #endif
1299 
1300  if (!team->t.t_serialized) {
1301 #if USE_ITT_BUILD
1302  // This value will be used in itt notify events below.
1303  void *itt_sync_obj = NULL;
1304 #if USE_ITT_NOTIFY
1305  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1306  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1307 #endif
1308 #endif /* USE_ITT_BUILD */
1309  if (__kmp_tasking_mode == tskm_extra_barrier) {
1310  __kmp_tasking_barrier(team, this_thr, gtid);
1311  KA_TRACE(15,
1312  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1313  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1314  }
1315 
1316  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1317  access it when the team struct is not guaranteed to exist. */
1318  // See note about the corresponding code in __kmp_join_barrier() being
1319  // performance-critical.
1320  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1321 #if KMP_USE_MONITOR
1322  this_thr->th.th_team_bt_intervals =
1323  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1324  this_thr->th.th_team_bt_set =
1325  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1326 #else
1327  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1328 #endif
1329  }
1330 
1331 #if USE_ITT_BUILD
1332  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1333  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1334 #endif /* USE_ITT_BUILD */
1335 #if USE_DEBUGGER
1336  // Let the debugger know: the thread arrived to the barrier and waiting.
1337  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1338  team->t.t_bar[bt].b_master_arrived += 1;
1339  } else {
1340  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1341  } // if
1342 #endif /* USE_DEBUGGER */
1343  if (reduce != NULL) {
1344  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1345  this_thr->th.th_local.reduce_data = reduce_data;
1346  }
1347 
1348  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1349  // use 0 to only setup the current team if nthreads > 1
1350  __kmp_task_team_setup(this_thr, team, 0);
1351 
1352  if (cancellable) {
1353  cancelled = __kmp_linear_barrier_gather_cancellable(
1354  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1355  } else {
1356  switch (__kmp_barrier_gather_pattern[bt]) {
1357  case bp_hyper_bar: {
1358  // don't set branch bits to 0; use linear
1359  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1360  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1361  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1362  break;
1363  }
1364  case bp_hierarchical_bar: {
1365  __kmp_hierarchical_barrier_gather(
1366  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1367  break;
1368  }
1369  case bp_tree_bar: {
1370  // don't set branch bits to 0; use linear
1371  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1372  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1373  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1374  break;
1375  }
1376  default: {
1377  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1378  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1379  }
1380  }
1381  }
1382 
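 // Full memory fence after the gather phase, before the primary thread
 // inspects task-team state and the team's cancellation request.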
1383  KMP_MB();
1384 
1385  if (KMP_MASTER_TID(tid)) {
1386  status = 0;
1387  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1388  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1389  }
1390 #if USE_DEBUGGER
1391  // Let the debugger know: all threads have arrived and are starting to
1392  // leave the barrier.
1393  team->t.t_bar[bt].b_team_arrived += 1;
1394 #endif
1395 
1396  if (__kmp_omp_cancellation) {
1397  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1398  // Reset cancellation flag for worksharing constructs
1399  if (cancel_request == cancel_loop ||
1400  cancel_request == cancel_sections) {
1401  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1402  }
1403  }
1404 #if USE_ITT_BUILD
1405  /* TODO: In case of split reduction barrier, primary thread may send
1406  acquired event early, before the final summation into the shared
1407  variable is done (final summation can be a long operation for array
1408  reductions). */
1409  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1410  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1411 #endif /* USE_ITT_BUILD */
1412 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1413  // Barrier - report frame end (only if active_level == 1)
1414  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1415  __kmp_forkjoin_frames_mode &&
1416  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1417  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1418  team->t.t_active_level == 1) {
1419  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1420  kmp_uint64 cur_time = __itt_get_timestamp();
1421  kmp_info_t **other_threads = team->t.t_threads;
1422  int nproc = this_thr->th.th_team_nproc;
1423  int i;
1424  switch (__kmp_forkjoin_frames_mode) {
1425  case 1:
1426  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1427  loc, nproc);
1428  this_thr->th.th_frame_time = cur_time;
1429  break;
1430  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1431  // be fixed)
1432  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1433  1, loc, nproc);
1434  break;
1435  case 3:
1436  if (__itt_metadata_add_ptr) {
1437  // Initialize with primary thread's wait time
1438  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1439  // Set arrive time to zero to be able to check it in
1440  // __kmp_invoke_task(); the same is done inside the loop below
1441  this_thr->th.th_bar_arrive_time = 0;
1442  for (i = 1; i < nproc; ++i) {
1443  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1444  other_threads[i]->th.th_bar_arrive_time = 0;
1445  }
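 // delta now totals the time each team member spent waiting at this barrier
 // (cur_time minus its recorded arrive time); it is reported below as the
 // imbalance metric for the frame.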
1446  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1447  cur_time, delta,
1448  (kmp_uint64)(reduce != NULL));
1449  }
1450  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1451  loc, nproc);
1452  this_thr->th.th_frame_time = cur_time;
1453  break;
1454  }
1455  }
1456 #endif /* USE_ITT_BUILD */
1457  } else {
1458  status = 1;
1459 #if USE_ITT_BUILD
1460  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1461  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1462 #endif /* USE_ITT_BUILD */
1463  }
1464  if ((status == 1 || !is_split) && !cancelled) {
1465  if (cancellable) {
1466  cancelled = __kmp_linear_barrier_release_cancellable(
1467  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1468  } else {
1469  switch (__kmp_barrier_release_pattern[bt]) {
1470  case bp_hyper_bar: {
1471  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1472  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1473  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474  break;
1475  }
1476  case bp_hierarchical_bar: {
1477  __kmp_hierarchical_barrier_release(
1478  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1479  break;
1480  }
1481  case bp_tree_bar: {
1482  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1483  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1484  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1485  break;
1486  }
1487  default: {
1488  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1489  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1490  }
1491  }
1492  }
1493  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1494  __kmp_task_team_sync(this_thr, team);
1495  }
1496  }
1497 
1498 #if USE_ITT_BUILD
1499  /* GEH: TODO: Move this under if-condition above and also include in
1500  __kmp_end_split_barrier(). This will more accurately represent the actual
1501  release time of the threads for split barriers. */
1502  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505  } else { // Team is serialized.
1506  status = 0;
1507  if (__kmp_tasking_mode != tskm_immediate_exec) {
1508  if (this_thr->th.th_task_team != NULL) {
1509 #if USE_ITT_NOTIFY
1510  void *itt_sync_obj = NULL;
1511  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1512  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1513  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1514  }
1515 #endif
1516 
1517  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1518  TRUE);
1519  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1520  __kmp_task_team_setup(this_thr, team, 0);
1521 
1522 #if USE_ITT_BUILD
1523  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1524  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1525 #endif /* USE_ITT_BUILD */
1526  }
1527  }
1528  }
1529  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1530  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1531  __kmp_tid_from_gtid(gtid), status));
1532 
1533 #if OMPT_SUPPORT
1534  if (ompt_enabled.enabled) {
1535 #if OMPT_OPTIONAL
1536  if (ompt_enabled.ompt_callback_sync_region_wait) {
1537  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1538  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1539  return_address);
1540  }
1541  if (ompt_enabled.ompt_callback_sync_region) {
1542  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1543  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1544  return_address);
1545  }
1546 #endif
1547  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1548  }
1549 #endif
1550 
1551  if (cancellable)
1552  return (int)cancelled;
1553  return status;
1554 }
1555 
1556 // Returns 0 if primary thread, 1 if worker thread.
1557 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1558  size_t reduce_size, void *reduce_data,
1559  void (*reduce)(void *, void *)) {
1560  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1561  reduce);
1562 }
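
// Usage sketch (illustrative only; the helper name below is hypothetical and
// __kmp_entry_gtid() from kmp.h is assumed): a full, non-split plain barrier
// with no reduction gathers and releases the team in a single call.
static inline void __kmp_example_full_barrier(void) {
  int gtid = __kmp_entry_gtid();
  // is_split=FALSE, no reduction callback: returns 0 on the primary thread
  // and 1 on workers, once the whole team has been released.
  (void)__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}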
1563 
1564 #if defined(KMP_GOMP_COMPAT)
1565 // Returns 1 if cancelled, 0 otherwise
1566 int __kmp_barrier_gomp_cancel(int gtid) {
1567  if (__kmp_omp_cancellation) {
1568  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1569  0, NULL, NULL);
1570  if (cancelled) {
1571  int tid = __kmp_tid_from_gtid(gtid);
1572  kmp_info_t *this_thr = __kmp_threads[gtid];
1573  if (KMP_MASTER_TID(tid)) {
1574  // Primary thread does not need to revert anything
1575  } else {
1576  // Workers need to revert their private b_arrived flag
1577  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1578  KMP_BARRIER_STATE_BUMP;
1579  }
1580  }
1581  return cancelled;
1582  }
1583  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1584  return FALSE;
1585 }
1586 #endif
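
#if defined(KMP_GOMP_COMPAT)
// Usage sketch (illustrative; the wrapper name is hypothetical; the real GOMP
// bindings live in kmp_gsupport.cpp): callers of the cancellation barrier
// simply branch on the returned cancelled/not-cancelled flag.
static inline bool __kmp_example_gomp_barrier_cancel(void) {
  // Non-zero means the enclosing parallel region was cancelled while this
  // thread was in the barrier, so the caller should leave the region.
  return __kmp_barrier_gomp_cancel(__kmp_entry_gtid()) != 0;
}
#endif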
1587 
1588 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1589  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1590  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1591  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
1592  int tid = __kmp_tid_from_gtid(gtid);
1593  kmp_info_t *this_thr = __kmp_threads[gtid];
1594  kmp_team_t *team = this_thr->th.th_team;
1595 
1596  if (!team->t.t_serialized) {
1597  if (KMP_MASTER_GTID(gtid)) {
1598  switch (__kmp_barrier_release_pattern[bt]) {
1599  case bp_hyper_bar: {
1600  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1601  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1602  FALSE USE_ITT_BUILD_ARG(NULL));
1603  break;
1604  }
1605  case bp_hierarchical_bar: {
1606  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1607  FALSE USE_ITT_BUILD_ARG(NULL));
1608  break;
1609  }
1610  case bp_tree_bar: {
1611  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1612  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1613  FALSE USE_ITT_BUILD_ARG(NULL));
1614  break;
1615  }
1616  default: {
1617  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1618  FALSE USE_ITT_BUILD_ARG(NULL));
1619  }
1620  }
1621  if (__kmp_tasking_mode != tskm_immediate_exec) {
1622  __kmp_task_team_sync(this_thr, team);
1623  } // if
1624  }
1625  }
1626 }
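
// Usage sketch (illustrative; the helper name, the choice of bs_plain_barrier,
// and the reduce arguments are assumptions; see __kmpc_reduce in
// kmp_csupport.cpp for the real pattern): a split barrier lets the primary
// thread return from the gather phase before the workers are released, finish
// the reduction, and only then perform the release half.
static inline void __kmp_example_split_reduce(int gtid, void *reduce_data,
                                              void (*reduce_fn)(void *,
                                                                void *)) {
  // is_split=TRUE: the primary thread skips the release phase and returns 0;
  // workers return 1 only after they have been released.
  int rc = __kmp_barrier(bs_plain_barrier, gtid, TRUE, 0, reduce_data,
                         reduce_fn);
  if (rc == 0) {
    // Primary thread: all workers have arrived and reduce_fn has folded their
    // contributions into reduce_data; finalize the result here, then release
    // the waiting workers.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
}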
1627 
1628 void __kmp_join_barrier(int gtid) {
1629  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1630  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1631 
1632  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1633 
1634  kmp_info_t *this_thr = __kmp_threads[gtid];
1635  kmp_team_t *team;
1636  kmp_uint nproc;
1637  kmp_info_t *master_thread;
1638  int tid;
1639 #ifdef KMP_DEBUG
1640  int team_id;
1641 #endif /* KMP_DEBUG */
1642 #if USE_ITT_BUILD
1643  void *itt_sync_obj = NULL;
1644 #if USE_ITT_NOTIFY
1645  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
1646  // Get object created at fork_barrier
1647  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1648 #endif
1649 #endif /* USE_ITT_BUILD */
1650  KMP_MB();
1651 
1652  // Get current info
1653  team = this_thr->th.th_team;
1654  nproc = this_thr->th.th_team_nproc;
1655  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1656  tid = __kmp_tid_from_gtid(gtid);
1657 #ifdef KMP_DEBUG
1658  team_id = team->t.t_id;
1659 #endif /* KMP_DEBUG */
1660  master_thread = this_thr->th.th_team_master;
1661 #ifdef KMP_DEBUG
1662  if (master_thread != team->t.t_threads[0]) {
1663  __kmp_print_structure();
1664  }
1665 #endif /* KMP_DEBUG */
1666  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1667  KMP_MB();
1668 
1669  // Verify state
1670  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1671  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1672  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1673  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1674  gtid, team_id, tid));
1675 
1676 #if OMPT_SUPPORT
1677  if (ompt_enabled.enabled) {
1678 #if OMPT_OPTIONAL
1679  ompt_data_t *my_task_data;
1680  ompt_data_t *my_parallel_data;
1681  void *codeptr = NULL;
1682  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1683  if (KMP_MASTER_TID(ds_tid) &&
1684  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1685  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1686  codeptr = team->t.ompt_team_info.master_return_address;
1687  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1688  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1689  if (ompt_enabled.ompt_callback_sync_region) {
1690  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1691  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1692  my_task_data, codeptr);
1693  }
1694  if (ompt_enabled.ompt_callback_sync_region_wait) {
1695  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1696  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1697  my_task_data, codeptr);
1698  }
1699  if (!KMP_MASTER_TID(ds_tid))
1700  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1701 #endif
1702  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1703  }
1704 #endif
1705 
1706  if (__kmp_tasking_mode == tskm_extra_barrier) {
1707  __kmp_tasking_barrier(team, this_thr, gtid);
1708  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1709  team_id, tid));
1710  }
1711 #ifdef KMP_DEBUG
1712  if (__kmp_tasking_mode != tskm_immediate_exec) {
1713  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1714  "%p, th_task_team = %p\n",
1715  __kmp_gtid_from_thread(this_thr), team_id,
1716  team->t.t_task_team[this_thr->th.th_task_state],
1717  this_thr->th.th_task_team));
1718  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1719  team->t.t_task_team[this_thr->th.th_task_state]);
1720  }
1721 #endif /* KMP_DEBUG */
1722 
1723  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1724  access it when the team struct is not guaranteed to exist. Doing these
1725  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1726  we do not perform the copy if blocktime=infinite, since the values are not
1727  used by __kmp_wait_template() in that case. */
1728  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1729 #if KMP_USE_MONITOR
1730  this_thr->th.th_team_bt_intervals =
1731  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1732  this_thr->th.th_team_bt_set =
1733  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1734 #else
1735  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1736 #endif
1737  }
1738 
1739 #if USE_ITT_BUILD
1740  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1741  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1742 #endif /* USE_ITT_BUILD */
1743 
1744  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1745  case bp_hyper_bar: {
1746  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1747  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1748  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1749  break;
1750  }
1751  case bp_hierarchical_bar: {
1752  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1753  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1754  break;
1755  }
1756  case bp_tree_bar: {
1757  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1758  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1759  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1760  break;
1761  }
1762  default: {
1763  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1764  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1765  }
1766  }
1767 
1768  /* From this point on, the team data structure may be deallocated at any time
1769  by the primary thread - it is unsafe to reference it in any of the worker
1770  threads. Any per-team data items that need to be referenced before the
1771  end of the barrier should be moved to the kmp_task_team_t structs. */
1772  if (KMP_MASTER_TID(tid)) {
1773  if (__kmp_tasking_mode != tskm_immediate_exec) {
1774  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1775  }
1776  if (__kmp_display_affinity) {
1777  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1778  }
1779 #if KMP_STATS_ENABLED
1780  // Have the primary thread flag the workers to indicate they are now waiting
1781  // for the next parallel region, and wake them up so they switch their timers
1782  // to idle.
1783  for (int i = 0; i < team->t.t_nproc; ++i) {
1784  kmp_info_t *team_thread = team->t.t_threads[i];
1785  if (team_thread == this_thr)
1786  continue;
1787  team_thread->th.th_stats->setIdleFlag();
1788  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1789  team_thread->th.th_sleep_loc != NULL)
1790  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1791  team_thread->th.th_sleep_loc);
1792  }
1793 #endif
1794 #if USE_ITT_BUILD
1795  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1796  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1797 #endif /* USE_ITT_BUILD */
1798 
1799 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1800  // Join barrier - report frame end
1801  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1802  __kmp_forkjoin_frames_mode &&
1803  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1804  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1805  team->t.t_active_level == 1) {
1806  kmp_uint64 cur_time = __itt_get_timestamp();
1807  ident_t *loc = team->t.t_ident;
1808  kmp_info_t **other_threads = team->t.t_threads;
1809  int nproc = this_thr->th.th_team_nproc;
1810  int i;
1811  switch (__kmp_forkjoin_frames_mode) {
1812  case 1:
1813  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1814  loc, nproc);
1815  break;
1816  case 2:
1817  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1818  loc, nproc);
1819  break;
1820  case 3:
1821  if (__itt_metadata_add_ptr) {
1822  // Initialize with primary thread's wait time
1823  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1824  // Set arrive time to zero to be able to check it in
1825  // __kmp_invoke_task(); the same is done inside the loop below
1826  this_thr->th.th_bar_arrive_time = 0;
1827  for (i = 1; i < nproc; ++i) {
1828  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1829  other_threads[i]->th.th_bar_arrive_time = 0;
1830  }
1831  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1832  cur_time, delta, 0);
1833  }
1834  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1835  loc, nproc);
1836  this_thr->th.th_frame_time = cur_time;
1837  break;
1838  }
1839  }
1840 #endif /* USE_ITT_BUILD */
1841  }
1842 #if USE_ITT_BUILD
1843  else {
1844  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1845  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1846  }
1847 #endif /* USE_ITT_BUILD */
1848 
1849 #if KMP_DEBUG
1850  if (KMP_MASTER_TID(tid)) {
1851  KA_TRACE(
1852  15,
1853  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1854  gtid, team_id, tid, nproc));
1855  }
1856 #endif /* KMP_DEBUG */
1857 
1858  // TODO now, mark worker threads as done so they may be disbanded
1859  KMP_MB(); // Flush all pending memory write invalidates.
1860  KA_TRACE(10,
1861  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1862 
1863 }
1864 
1865 // TODO release worker threads' fork barriers as we are ready instead of all at
1866 // once
1867 void __kmp_fork_barrier(int gtid, int tid) {
1868  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1869  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1870  kmp_info_t *this_thr = __kmp_threads[gtid];
1871  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1872 #if USE_ITT_BUILD
1873  void *itt_sync_obj = NULL;
1874 #endif /* USE_ITT_BUILD */
1875  if (team)
1876  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1877  (team != NULL) ? team->t.t_id : -1, tid));
1878 
1879 
1880  // th_team pointer only valid for primary thread here
1881  if (KMP_MASTER_TID(tid)) {
1882 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1883  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1884  // Create itt barrier object
1885  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1886  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1887  }
1888 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1889 
1890 #ifdef KMP_DEBUG
1891  KMP_DEBUG_ASSERT(team);
1892  kmp_info_t **other_threads = team->t.t_threads;
1893  int i;
1894 
1895  // Verify state
1896  KMP_MB();
1897 
1898  for (i = 1; i < team->t.t_nproc; ++i) {
1899  KA_TRACE(500,
1900  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1901  "== %u.\n",
1902  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1903  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1904  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1905  KMP_DEBUG_ASSERT(
1906  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1907  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1908  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1909  }
1910 #endif
1911 
1912  if (__kmp_tasking_mode != tskm_immediate_exec) {
1913  // 0 indicates: set up only the current task team (if nthreads > 1)
1914  __kmp_task_team_setup(this_thr, team, 0);
1915  }
1916 
1917  /* The primary thread may have changed its blocktime between join barrier
1918  and fork barrier. Copy the blocktime info to the thread, where
1919  __kmp_wait_template() can access it when the team struct is not
1920  guaranteed to exist. */
1921  // See note about the corresponding code in __kmp_join_barrier() being
1922  // performance-critical
1923  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1924 #if KMP_USE_MONITOR
1925  this_thr->th.th_team_bt_intervals =
1926  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1927  this_thr->th.th_team_bt_set =
1928  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1929 #else
1930  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1931 #endif
1932  }
1933  } // primary thread
1934 
1935  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1936  case bp_hyper_bar: {
1937  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1938  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1939  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1940  break;
1941  }
1942  case bp_hierarchical_bar: {
1943  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1944  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1945  break;
1946  }
1947  case bp_tree_bar: {
1948  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1949  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1950  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1951  break;
1952  }
1953  default: {
1954  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1955  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1956  }
1957  }
1958 
1959 #if OMPT_SUPPORT
1960  if (ompt_enabled.enabled &&
1961  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1962  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1963  ompt_data_t *task_data = (team)
1964  ? OMPT_CUR_TASK_DATA(this_thr)
1965  : &(this_thr->th.ompt_thread_info.task_data);
1966  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1967 #if OMPT_OPTIONAL
1968  void *codeptr = NULL;
1969  if (KMP_MASTER_TID(ds_tid) &&
1970  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1971  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1972  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
1973  if (ompt_enabled.ompt_callback_sync_region_wait) {
1974  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1975  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
1976  codeptr);
1977  }
1978  if (ompt_enabled.ompt_callback_sync_region) {
1979  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1980  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
1981  codeptr);
1982  }
1983 #endif
1984  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1985  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1986  ompt_scope_end, NULL, task_data, 0, ds_tid,
1987  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1988  }
1989  }
1990 #endif
1991 
1992  // Early exit for reaping threads releasing forkjoin barrier
1993  if (TCR_4(__kmp_global.g.g_done)) {
1994  this_thr->th.th_task_team = NULL;
1995 
1996 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1997  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1998  if (!KMP_MASTER_TID(tid)) {
1999  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2000  if (itt_sync_obj)
2001  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2002  }
2003  }
2004 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2005  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2006  return;
2007  }
2008 
2009  /* We can now assume that a valid team structure has been allocated by the
2010  primary thread and propagated to all worker threads. The current thread,
2011  however, may not be part of the team, so we can't blindly assume that the
2012  team pointer is non-null. */
2013  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2014  KMP_DEBUG_ASSERT(team != NULL);
2015  tid = __kmp_tid_from_gtid(gtid);
2016 
2017 #if KMP_BARRIER_ICV_PULL
2018  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2019  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2020  implicit task has this data before this function is called. We cannot
2021  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2022  thread struct, because it is not always the case that the threads arrays
2023  have been allocated when __kmp_fork_call() is executed. */
2024  {
2025  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2026  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2027  // Copy the initial ICVs from the primary thread's thread struct to the
2028  // implicit task for this tid.
2029  KA_TRACE(10,
2030  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2031  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2032  tid, FALSE);
2033  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2034  &team->t.t_threads[0]
2035  ->th.th_bar[bs_forkjoin_barrier]
2036  .bb.th_fixed_icvs);
2037  }
2038  }
2039 #endif // KMP_BARRIER_ICV_PULL
2040 
2041  if (__kmp_tasking_mode != tskm_immediate_exec) {
2042  __kmp_task_team_sync(this_thr, team);
2043  }
2044 
2045 #if KMP_AFFINITY_SUPPORTED
2046  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2047  if (proc_bind == proc_bind_intel) {
2048  // Call dynamic affinity settings
2049  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2050  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2051  }
2052  } else if (proc_bind != proc_bind_false) {
2053  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2054  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2055  __kmp_gtid_from_thread(this_thr),
2056  this_thr->th.th_current_place));
2057  } else {
2058  __kmp_affinity_set_place(gtid);
2059  }
2060  }
2061 #endif // KMP_AFFINITY_SUPPORTED
2062  // Perform the display affinity functionality
2063  if (__kmp_display_affinity) {
2064  if (team->t.t_display_affinity
2065 #if KMP_AFFINITY_SUPPORTED
2066  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2067 #endif
2068  ) {
2069  // NULL means use the affinity-format-var ICV
2070  __kmp_aux_display_affinity(gtid, NULL);
2071  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2072  this_thr->th.th_prev_level = team->t.t_level;
2073  }
2074  }
2075  if (!KMP_MASTER_TID(tid))
2076  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2077 
2078 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2079  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2080  if (!KMP_MASTER_TID(tid)) {
2081  // Get correct barrier object
2082  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2083  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2084  } // (prepare called inside barrier_release)
2085  }
2086 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2087  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2088  team->t.t_id, tid));
2089 }
2090 
2091 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2092  kmp_internal_control_t *new_icvs, ident_t *loc) {
2093  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2094 
2095  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2096  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2097 
2098 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2099  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2100  implicit task has this data before this function is called. */
2101 #if KMP_BARRIER_ICV_PULL
2102  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2103  remains untouched), where all of the worker threads can access them and
2104  make their own copies after the barrier. */
2105  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2106  // allocated at this point
2107  copy_icvs(
2108  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2109  new_icvs);
2110  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2111  team->t.t_threads[0], team));
2112 #elif KMP_BARRIER_ICV_PUSH
2113  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2114  // done here.
2115  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2116  team->t.t_threads[0], team));
2117 #else
2118  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2119  // time.
2120  ngo_load(new_icvs);
2121  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2122  // allocated at this point
2123  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2124  // TODO: GEH - pass in better source location info since usually NULL here
2125  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2126  f, team->t.t_threads[f], team));
2127  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2128  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2129  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2130  f, team->t.t_threads[f], team));
2131  }
2132  ngo_sync();
2133 #endif // KMP_BARRIER_ICV_PULL
2134 }