/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */
#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif
#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif
#define KMP_USE_PRCTL 0
#if KMP_OS_WINDOWS
#include <process.h>
#endif
#ifndef KMP_USE_SHM
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif
// Identification strings. Each one is KMP_VERSION_PREFIX (defined elsewhere)
// followed by a human-readable description.
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif
char const __kmp_version_omp_api[] =
KMP_VERSION_PREFIX "API version: 5.0 (201611)";
#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
KMP_VERSION_PREFIX "lock type: run time selectable";
#endif
// Smaller of x and y. The selected argument is evaluated twice, so do not pass
// expressions with side effects.
#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
#if KMP_USE_MONITOR
// Presumably the descriptor of the monitor thread; only built when
// KMP_USE_MONITOR is enabled.
kmp_info_t __kmp_monitor;
#endif
// Forward declarations of routines that are referenced in this file before
// (or without) a visible definition.
void __kmp_cleanup(void);
static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs,
ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
// update_master_only defaults to 0 when the caller omits it.
static void __kmp_partition_places(kmp_team_t *team,
int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs, ident_t *loc);
#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif
static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
// Starts out as NULL; judging by the name this is an insertion cursor for the
// thread pool (its users are outside this section -- confirm there).
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
// Build a freshly allocated nested-nthreads list for thread `thr` at nesting
// depth `level`.
//
// The list holds level + thr->th.th_set_nested_nth_sz entries: slots
// [0, level] are zeroed and the remaining slots are copied from
// thr->th.th_set_nested_nth starting at its index 1. `size` and `used` are both
// set to the new length. The descriptor and its array are obtained with
// KMP_INTERNAL_MALLOC and are not released here.
static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *result = (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
      sizeof(kmp_nested_nthreads_t));
  const int total = level + thr->th.th_set_nested_nth_sz;
  result->nth = (int *)KMP_INTERNAL_MALLOC(total * sizeof(int));

  // Leading slots, up to and including `level`, are cleared.
  for (int dst = 0; dst <= level; ++dst) {
    result->nth[dst] = 0;
  }

  // Remaining slots come from the thread's requested list, skipping entry 0.
  int src = 1;
  for (int dst = level + 1; dst < total; ++dst) {
    result->nth[dst] = thr->th.th_set_nested_nth[src++];
  }

  result->used = total;
  result->size = total;
  return result;
}
/* Determine the global thread id (gtid) of the calling
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
int i;
kmp_info_t **other_threads;
size_t stack_data;
char *stack_addr;
size_t stack_size;
char *stack_base;
KA_TRACE(
1000,
("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
__kmp_nth, __kmp_all_nth));
a parallel region, made it return KMP_GTID_DNE to force serial_initialize
by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
__kmp_init_gtid for this to work. */
if (!TCR_4(__kmp_init_gtid))
return KMP_GTID_DNE;
#ifdef KMP_TDATA_GTID
if (TCR_4(__kmp_gtid_mode) >= 3) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
return __kmp_gtid;
}
#endif
if (TCR_4(__kmp_gtid_mode) >= 2) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
return __kmp_gtid_get_specific();
}
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
stack_addr = (char *)&stack_data;
other_threads = __kmp_threads;
access to __kmp_threads array. For example:
1. Current thread loads other_threads[i] to thr and checks it, it is
non-NULL.
2. Current thread is suspended by OS.
3. Another thread unregisters and finishes (debug versions of free()
may fill memory with something like 0xEF).
4. Current thread is resumed.
5. Current thread reads junk from *thr.
TODO: Fix it. --ln */
for (i = 0; i < __kmp_threads_capacity; i++) {
kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
if (!thr)
continue;
stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
if (stack_addr <= stack_base) {
size_t stack_diff = stack_base - stack_addr;
if (stack_diff <= stack_size) {
KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
__kmp_gtid_get_specific() == i);
return i;
}
}
}
KA_TRACE(1000,
("*** __kmp_get_global_thread_id: internal alg. failed to find "
"thread, using TLS\n"));
i = __kmp_gtid_get_specific();
if (i < 0)
return i;
if (!TCR_SYNC_PTR(other_threads[i]))
return i;
call */
if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
KMP_FATAL(StackOverflow, i);
}
stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
if (stack_addr > stack_base) {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
stack_base);
} else {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
stack_base - stack_addr);
}
if (__kmp_storage_map) {
char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
__kmp_print_storage_map_gtid(i, stack_beg, stack_end,
other_threads[i]->th.th_info.ds.ds_stacksize,
"th_%d stack (refinement)", i);
}
return i;
}
/* Return the calling thread's gtid, registering the thread when it has none.

   The gtid is looked up the same way as in __kmp_get_global_thread_id()
   (TDATA, keyed TLS, or the internal algorithm, depending on
   __kmp_gtid_mode), except that an uninitialized library (__kmp_init_serial
   not set) is treated as "no gtid". When the result is KMP_GTID_DNE, the
   thread either performs serial initialization or registers itself as a new
   root, under __kmp_initz_lock. */
int __kmp_get_global_thread_id_reg() {
  int gtid = KMP_GTID_DNE;

  if (__kmp_init_serial) {
#ifdef KMP_TDATA_GTID
    if (TCR_4(__kmp_gtid_mode) >= 3) {
      KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
      gtid = __kmp_gtid;
    } else
#endif
    if (TCR_4(__kmp_gtid_mode) >= 2) {
      KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
      gtid = __kmp_gtid_get_specific();
    } else {
      KA_TRACE(1000,
               ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
      gtid = __kmp_get_global_thread_id();
    }
  }

  /* No gtid yet: this thread has to be set up as a new root. */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    /* Re-check under the lock: serial initialization may still be pending. */
    if (__kmp_init_serial) {
      gtid = __kmp_register_root(FALSE);
    } else {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
  }

  KMP_DEBUG_ASSERT(gtid >= 0);
  return gtid;
}
/* Report and validate the stack range of thread `th`.

   - When __kmp_storage_map is set, print th's stack bounds
     [ds_stackbase - ds_stacksize, ds_stackbase] via
     __kmp_print_storage_map_gtid().
   - When __kmp_env_checks is TRUE and th is not an uber thread, compare th's
     stack range with every other registered thread in __kmp_threads. If either
     end of th's range lies strictly inside another thread's range, the other
     range is printed (when the storage map is enabled) and __kmp_fatal() is
     called with StackOverlap. */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    /* The monitor thread has no numeric name, so it is labelled "mon". */
    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* Threads for which KMP_UBER_GTID(gtid) holds are not checked; per the
   * original note their stacks
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    /* The bounds were not computed above when the storage map is disabled. */
    if (stack_beg == NULL) {
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other thread's stack bounds before aborting. */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
/* Spin, yielding on every iteration, until `done` becomes nonzero. Nothing in
   this function ever sets `done`, so in practice the call does not return. */
void __kmp_infinite_loop(void) {
  static int done = FALSE;

  for (;;) {
    if (done)
      break;
    KMP_YIELD(TRUE);
  }
}
/* Size of the on-stack buffers used to build message format strings. */
#define MAX_MESSAGE 512

/* Print one storage-map line to kmp_err.

   gtid    - thread the storage belongs to; only used by the optional data
             placement report, which is skipped for negative values
   p1, p2  - start and end address of the region
   size    - size of the region in bytes
   format  - printf-style description; it is spliced into the generated format
             string, so its conversions consume the variadic arguments

   The generated format string is limited to MAX_MESSAGE bytes. Output is
   serialized with __kmp_stdio_lock. When built with KMP_PRINT_DATA_PLACEMENT
   and __kmp_storage_map_verbose is set, the host node of the region's first
   and last page is reported as well. */
void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    /* Only report placement when [p1, p2) really spans `size` bytes. */
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* the query does not work, so stop trying */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);
          const int page_size = KMP_GET_PAGE_SIZE();

          /* Round p1 down to its page and p2 down to the page holding the
             last byte of the region. */
          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock(" GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now (KMP_USE_PRCTL is
           * defined as 0) because of a
           * hanging bug. */
          do {
            last = (char *)p1;
            lastNode = node;
            /* Collate adjacent pages that report the same host node. */
            do {
              p1 = (char *)p1 + page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}
void __kmp_warn(char const *format, ...) {
char buffer[MAX_MESSAGE];
va_list ap;
if (__kmp_generate_warnings == kmp_warnings_off) {
return;
}
va_start(ap, format);
KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
__kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
__kmp_vprintf(kmp_err, buffer, ap);
__kmp_release_bootstrap_lock(&__kmp_stdio_lock);
va_end(ap);
}
/* Terminate the whole process abnormally.

   Takes __kmp_exit_lock, dumps the debug buffer when __kmp_debug_buf is set,
   and then aborts: on Windows via raise(SIGABRT) followed by _exit(3), on
   other systems via __kmp_unregister_library() followed by abort(). */
void __kmp_abort_process() {
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  /* Record the abnormal termination in the global state. */
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows a plain abort() can raise a pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.

     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); /* reached only if raise() returns */
#else
  __kmp_unregister_library();
  abort();
#endif

  /* Not expected to be reached; kept as a fallback should the calls above
     return. */
  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
}
/* Park the calling thread by handing it to __kmp_infinite_loop(). */
void __kmp_abort_thread(void) { __kmp_infinite_loop(); }
/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */
/* Emit storage-map lines for thread descriptor `thr` (global id `gtid`): the
   whole kmp_info_t, its th_info and th_local members, the complete th_bar
   array, and the individual plain / fork-join (and, with
   KMP_FAST_REDUCTION_BARRIER, reduction) barrier entries. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  kmp_balign_t *const bars = thr->th.th_bar;

  /* Whole descriptor. */
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  /* Individual members; each range ends where the next member begins. */
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  /* Barrier array as a whole, then entry by entry. */
  __kmp_print_storage_map_gtid(gtid, bars, bars + bs_last_barrier,
                               sizeof(kmp_balign_t) * bs_last_barrier,
                               "th_%d.th_bar", gtid);
  __kmp_print_storage_map_gtid(gtid, bars + bs_plain_barrier,
                               bars + bs_plain_barrier + 1,
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);
  __kmp_print_storage_map_gtid(gtid, bars + bs_forkjoin_barrier,
                               bars + bs_forkjoin_barrier + 1,
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);
#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, bars + bs_reduction_barrier,
                               bars + bs_reduction_barrier + 1,
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif
}
/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */
/* Emit storage-map lines for `team`.

   header  - label prefix used in every line ("<header>_<team_id>...")
   team    - team whose storage is described
   team_id - number appended to the label
   num_thr - number of entries reported for t_dispatch and t_threads

   Covers the whole kmp_team_t, the t_bar array and its individual entries,
   and the t_dispatch, t_threads and t_disp_buffer arrays. */
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  /* Teams limited to one thread are reported with two dispatch buffers. */
  int num_disp_buff;
  if (team->t.t_max_nproc > 1)
    num_disp_buff = __kmp_dispatch_num_buffers;
  else
    num_disp_buff = 2;

  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  /* Barrier array as a whole, then entry by entry. */
  __kmp_print_storage_map_gtid(-1, team->t.t_bar + 0,
                               team->t.t_bar + bs_last_barrier,
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);
  __kmp_print_storage_map_gtid(-1, team->t.t_bar + bs_plain_barrier,
                               team->t.t_bar + bs_plain_barrier + 1,
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);
  __kmp_print_storage_map_gtid(-1, team->t.t_bar + bs_forkjoin_barrier,
                               team->t.t_bar + bs_forkjoin_barrier + 1,
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);
#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, team->t.t_bar + bs_reduction_barrier,
                               team->t.t_bar + bs_reduction_barrier + 1,
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif

  /* Per-thread arrays. */
  __kmp_print_storage_map_gtid(-1, team->t.t_dispatch + 0,
                               team->t.t_dispatch + num_thr,
                               sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch",
                               header, team_id);
  __kmp_print_storage_map_gtid(-1, team->t.t_threads + 0,
                               team->t.t_threads + num_thr,
                               sizeof(kmp_info_t *) * num_thr,
                               "%s_%d.t_threads", header, team_id);

  /* Shared dispatch buffers. */
  __kmp_print_storage_map_gtid(-1, team->t.t_disp_buffer + 0,
                               team->t.t_disp_buffer + num_disp_buff,
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}
/* Initialize allocator support: __kmp_init_memkind() first, then
   __kmp_init_target_mem(). */
static void __kmp_init_allocator() {
  __kmp_init_memkind();

  __kmp_init_target_mem();
}
/* Shut down allocator support by calling __kmp_fini_memkind(). */
static void __kmp_fini_allocator() {
  __kmp_fini_memkind();
}
#if ENABLE_LIBOMPTARGET
/* Initialize target-offload support by calling __kmp_init_target_task().
   Only built when ENABLE_LIBOMPTARGET is set. */
static void __kmp_init_omptarget() { __kmp_init_target_task(); }
#endif
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
/* DLL entry point (Windows dynamic-library builds only).

   hInstDLL   - module handle (unused)
   fdwReason  - why the entry point is being called
   lpReserved - for DLL_PROCESS_DETACH, library shutdown is only performed
                when this is NULL

   Always returns TRUE. */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
    /* Per the Windows DllMain documentation, lpReserved is NULL when the DLL
       is being unloaded explicitly and non-NULL when the process is
       terminating. Shutdown is only attempted in the former case. */
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());
    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
    /* New threads are not registered here. To register every new thread at
     * attach time, this is the place to call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}
#endif
#endif
/* Entry into an ordered section inside a parallel region.

   gtid_ref - points to the caller's gtid
   cid_ref  - not referenced here
   loc_ref  - source location passed to the consistency checker

   With consistency checking enabled and an active root, the construct is
   pushed on the sync stack. With BUILD_PARALLEL_ORDERED and a non-serialized
   team, the thread then waits until the team's ordered counter equals its own
   tid. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check &&
      __kmp_threads[gtid]->th.th_root->r.r_active) {
#if KMP_USE_DYNAMIC_LOCK
    __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
    __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }

#ifdef BUILD_PARALLEL_ORDERED
  /* A serialized team has nobody to wait for. */
  if (team->t.t_serialized)
    return;

  KMP_MB();
  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
           NULL);
  KMP_MB();
#endif /* BUILD_PARALLEL_ORDERED */
}
/* Exit from an ordered section inside a parallel region.

   gtid_ref - points to the caller's gtid
   cid_ref  - not referenced here
   loc_ref  - source location passed to the consistency checker

   With consistency checking enabled and an active root, the construct is
   popped from the sync stack. With BUILD_PARALLEL_ORDERED and a
   non-serialized team, the team's ordered counter is advanced to the next
   tid, wrapping around at t_nproc. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check &&
      __kmp_threads[gtid]->th.th_root->r.r_active) {
    __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }

#ifdef BUILD_PARALLEL_ORDERED
  /* A serialized team has no successor to hand the turn to. */
  if (team->t.t_serialized)
    return;

  KMP_MB();
  /* Pass the turn to the next thread of the team. */
  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
  KMP_MB();
#endif /* BUILD_PARALLEL_ORDERED */
}
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
int status;
kmp_info_t *th;
kmp_team_t *team;
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
__kmp_resume_if_soft_paused();
th = __kmp_threads[gtid];
team = th->th.th_team;
status = 0;
th->th.th_ident = id_ref;
if (team->t.t_serialized) {
status = 1;
} else {
kmp_int32 old_this = th->th.th_local.this_construct;
++th->th.th_local.this_construct;
single block */
if (team->t.t_construct == old_this) {
status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
th->th.th_local.this_construct);
}
#if USE_ITT_BUILD
if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
team->t.t_active_level == 1) {
__kmp_itt_metadata_single(id_ref);
}
#endif
}
if (__kmp_env_consistency_check) {
if (status && push_ws) {
__kmp_push_workshare(gtid, ct_psingle, id_ref);
} else {
__kmp_check_workshare(gtid, ct_psingle, id_ref);
}
}
#if USE_ITT_BUILD
if (status) {
__kmp_itt_single_start(gtid);
}
#endif
return status;
}
/* Leave a single block executed by thread `gtid`: notify ITT (when built with
   USE_ITT_BUILD) and, with consistency checking enabled, pop the single
   workshare from the stack. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */

  if (!__kmp_env_consistency_check)
    return;

  __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
/* Determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
// Decide how many threads a new team may actually use.
//   root         - root of the forking thread; its r_active flag and hot team
//                  size are used to correct the global thread counts
//                  (presumably because those threads are already counted)
//   parent_team  - team of the forking thread
//   master_tid   - index of the forking thread within parent_team
//   set_nthreads - number of threads requested
//   enter_teams  - not referenced in this function body
// Returns 1 when the request collapses to a single thread, otherwise the
// (possibly reduced) number of threads. When this_thr->th.th_nt_strict is set
// and fewer threads than requested are granted, __kmpc_error() is called
// before returning.
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
int master_tid, int set_nthreads,
int enter_teams) {
int capacity;
int new_nthreads;
KMP_DEBUG_ASSERT(__kmp_init_serial);
KMP_DEBUG_ASSERT(root && parent_team);
kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
// Start from the request; the dynamic-adjustment modes below may lower it.
new_nthreads = set_nthreads;
if (!get__dynamic_2(parent_team, master_tid)) {
// get__dynamic_2() is false: keep the request unchanged.
;
}
#ifdef USE_LOAD_BALANCE
else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
// Load-balance mode: let __kmp_load_balance_nproc() pick the count.
new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
if (new_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
}
}
#endif
else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
// Thread-limit mode: bound by __kmp_avail_proc minus the threads in use.
new_nthreads = __kmp_avail_proc - __kmp_nth +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (new_nthreads <= 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
} else {
// Never hand out more than was requested.
new_nthreads = set_nthreads;
}
} else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
// Random mode: only applied to requests for more than two threads. The
// result lies in [1, set_nthreads], assuming __kmp_get_random() is
// non-negative.
if (set_nthreads > 2) {
new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
new_nthreads = (new_nthreads % set_nthreads) + 1;
if (new_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
}
}
} else {
// Unknown dynamic mode.
KMP_ASSERT(0);
}
// Cap by __kmp_max_nth (the trace messages call this limit
// KMP_DEVICE_THREAD_LIMIT).
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
__kmp_max_nth) {
int tl_nthreads = __kmp_max_nth - __kmp_nth +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (tl_nthreads <= 0) {
tl_nthreads = 1;
}
// Warn at most once (__kmp_reserve_warn), and only when get__dynamic_2()
// is false.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
}
if (tl_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
"reduced reservation to 1 thread\n",
master_tid));
return 1;
}
KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
"reservation to %d threads\n",
master_tid, tl_nthreads));
new_nthreads = tl_nthreads;
}
// Cap by the thread limit recorded in the thread's th_cg_roots entry (the
// trace messages call this limit OMP_THREAD_LIMIT).
int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
if (cg_nthreads + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
max_cg_threads) {
int tl_nthreads = max_cg_threads - cg_nthreads +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (tl_nthreads <= 0) {
tl_nthreads = 1;
}
// Same one-time warning as above.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
}
if (tl_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
"reduced reservation to 1 thread\n",
master_tid));
return 1;
}
KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
"reservation to %d threads\n",
master_tid, tl_nthreads));
new_nthreads = tl_nthreads;
}
// Make sure the __kmp_threads array has enough slots, expanding it if needed.
capacity = __kmp_threads_capacity;
// One slot is discounted while __kmp_threads[0] is still empty.
if (TCR_PTR(__kmp_threads[0]) == NULL) {
--capacity;
}
// The hidden helper slots are discounted while hidden helpers are enabled but
// their threads are not initialized yet.
if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
capacity -= __kmp_hidden_helper_threads_num;
}
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
capacity) {
int slotsRequired = __kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
capacity;
int slotsAdded = __kmp_expand_threads(slotsRequired);
if (slotsAdded < slotsRequired) {
// The array could not grow enough: shrink the team by the shortfall.
new_nthreads -= (slotsRequired - slotsAdded);
KMP_ASSERT(new_nthreads >= 1);
// Same one-time warning condition; the hint depends on __kmp_tp_cached.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
if (__kmp_tp_cached) {
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
} else {
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
}
}
}
}
#ifdef KMP_DEBUG
// Debug-only trace of the final decision.
if (new_nthreads == 1) {
KC_TRACE(10,
("__kmp_reserve_threads: T#%d serializing team after reclaiming "
"dead roots and rechecking; requested %d threads\n",
__kmp_get_gtid(), set_nthreads));
} else {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
" %d threads\n",
__kmp_get_gtid(), new_nthreads, set_nthreads));
}
#endif
// Strict mode: getting fewer threads than requested is reported through
// __kmpc_error() with the location, severity and message stored on the thread.
if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
__kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
this_thr->th.th_nt_msg);
}
return new_nthreads;
}
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within critical section forkjoin */
// Set up `team` for a fork: bind the primary thread to it, obtain worker
// threads when the team is not an already-populated hot team, and update the
// primary thread's task state.
//   root               - root of the forking thread
//   team               - team being populated; team->t.t_nproc is its size
//   master_th          - primary thread descriptor (becomes tid 0)
//   master_gtid        - gtid of master_th (asserted to be the caller's gtid)
//   fork_teams_workers - when nonzero, place partitioning is skipped
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
kmp_info_t *master_th, int master_gtid,
int fork_teams_workers) {
int i;
int use_hot_team;
KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
KMP_MB();
// Point the primary thread at the new team as tid 0.
master_th->th.th_info.ds.ds_tid = 0;
master_th->th.th_team = team;
master_th->th.th_team_nproc = team->t.t_nproc;
master_th->th.th_team_master = master_th;
master_th->th.th_team_serialized = FALSE;
master_th->th.th_dispatch = &team->t.t_dispatch[0];
// Decide whether `team` is a hot team whose threads are already in place.
#if KMP_NESTED_HOT_TEAMS
use_hot_team = 0;
kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
// With no hot team array on the primary thread, use_hot_team stays 0.
if (hot_teams) {
// Index into the hot team array, derived from the active nesting level.
int level = team->t.t_active_level - 1;
if (master_th->th.th_teams_microtask) {
// Inside a teams construct the index is bumped in two cases.
if (master_th->th.th_teams_size.nteams > 1) {
++level;
}
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
master_th->th.th_teams_level == team->t.t_level) {
++level;
}
}
if (level < __kmp_hot_teams_max_level) {
if (hot_teams[level].hot_team) {
// A hot team is already recorded for this level; it must be this team.
KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
use_hot_team = 1;
} else {
// Nothing recorded yet: remember this team; its workers are allocated
// below.
use_hot_team = 0;
hot_teams[level].hot_team = team;
hot_teams[level].hot_team_nth = team->t.t_nproc;
}
} else {
use_hot_team = 0;
}
}
#else
use_hot_team = team == root->r.r_hot_team;
#endif
if (!use_hot_team) {
// Not a ready hot team: install the primary thread, then every worker.
team->t.t_threads[0] = master_th;
__kmp_initialize_info(master_th, team, 0, master_gtid);
for (i = 1; i < team->t.t_nproc; i++) {
// Obtain a thread for slot i and install it in the team.
kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
team->t.t_threads[i] = thr;
KMP_DEBUG_ASSERT(thr);
KMP_DEBUG_ASSERT(thr->th.th_team == team);
KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
"T#%d(%d:%d) join =%llu, plain=%llu\n",
__kmp_gtid_from_tid(0, team), team->t.t_id, 0,
__kmp_gtid_from_tid(i, team), team->t.t_id, i,
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
team->t.t_bar[bs_plain_barrier].b_arrived));
// Workers inherit the primary thread's teams-construct settings.
thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
thr->th.th_teams_level = master_th->th.th_teams_level;
thr->th.th_teams_size = master_th->th.th_teams_size;
{
// Start the worker's barrier counters from the team's current values.
int b;
kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
}
#if KMP_AFFINITY_SUPPORTED
// Place partitioning is skipped when fork_teams_workers is set.
if (!fork_teams_workers) {
__kmp_partition_places(team);
}
#endif
// Distributed fork/join barrier: resize it and add the team's threads.
if (team->t.t_nproc > 1 &&
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
team->t.b->update_num_threads(team->t.t_nproc);
__kmp_add_threads_to_team(team, team->t.t_nproc);
}
}
// Handle the primary thread's task state unless tasking is in immediate-exec
// mode.
if (__kmp_tasking_mode != tskm_immediate_exec) {
if (use_hot_team) {
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
KA_TRACE(
20,
("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
"%p, new task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
team));
// Save the primary thread's current task state on the team.
KMP_CHECK_UPDATE(team->t.t_primary_task_state,
master_th->th.th_task_state);
// Then adopt thread 1's task state, or 0 when the team has no workers.
if (team->t.t_nproc > 1) {
KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
team->t.t_threads[1]->th.th_task_state == 1);
KMP_CHECK_UPDATE(master_th->th.th_task_state,
team->t.t_threads[1]->th.th_task_state);
} else {
master_th->th.th_task_state = 0;
}
} else {
// Not a ready hot team: save the current task state on the team and reset
// it to 0.
KMP_CHECK_UPDATE(team->t.t_primary_task_state,
master_th->th.th_task_state);
master_th->th.th_task_state = 0;
}
}
// Flag the team for affinity display as soon as one thread's previous team
// size or nesting level differs from this team's.
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
for (i = 0; i < team->t.t_nproc; i++) {
kmp_info_t *thr = team->t.t_threads[i];
if (thr->th.th_prev_num_threads != team->t.t_nproc ||
thr->th.th_prev_level != team->t.t_level) {
team->t.t_display_affinity = 1;
break;
}
}
}
KMP_MB();
}
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Record the calling thread's floating-point control settings on `team`.
//
// When __kmp_inherit_fp_control is set, the current x87 control word and the
// MXCSR value (masked with KMP_X86_MXCSR_MASK) are stored on the team and the
// team is marked as having saved FP control; otherwise the mark is cleared.
// All stores go through KMP_CHECK_UPDATE.
inline static void propagateFPControl(kmp_team_t *team) {
  if (!__kmp_inherit_fp_control) {
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
    return;
  }

  kmp_int16 fpu_cw;
  kmp_uint32 sse_csr;

  // Read the calling thread's current x87 and MXCSR settings.
  __kmp_store_x87_fpu_control_word(&fpu_cw);
  __kmp_store_mxcsr(&sse_csr);
  sse_csr &= KMP_X86_MXCSR_MASK;

  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, fpu_cw);
  KMP_CHECK_UPDATE(team->t.t_mxcsr, sse_csr);
  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
}
// Load the floating-point control settings saved on `team` into the calling
// thread's hardware registers.
//
// Does nothing unless __kmp_inherit_fp_control is set and the team has saved
// settings. Each register is only reloaded when its current value differs
// from the team's; the x87 status word is cleared before the x87 control word
// is loaded.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (!__kmp_inherit_fp_control || !team->t.t_fp_control_saved)
    return;

  kmp_int16 current_cw;
  kmp_uint32 current_csr;

  // Read what the hardware holds right now.
  __kmp_store_x87_fpu_control_word(&current_cw);
  __kmp_store_mxcsr(&current_csr);
  current_csr &= KMP_X86_MXCSR_MASK;

  if (team->t.t_x87_fpu_control_word != current_cw) {
    __kmp_clear_x87_fpu_status_word();
    __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
  }

  if (team->t.t_mxcsr != current_csr) {
    __kmp_load_mxcsr(&team->t.t_mxcsr);
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif
// Forward declaration; the definition is not part of this section.
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
int realloc);
/* Run a parallel region that has been serialized, so runs only in a team of the
single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
kmp_info_t *this_thr;
kmp_team_t *serial_team;
KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
/* Skip all this code for autopar serialized loops since it results in
unacceptable overhead */
if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
return;
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
__kmp_resume_if_soft_paused();
this_thr = __kmp_threads[global_tid];
serial_team = this_thr->th.th_serial_team;
KMP_DEBUG_ASSERT(serial_team);
KMP_MB();
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else if (proc_bind == proc_bind_default) {
proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
}
this_thr->th.th_set_proc_bind = proc_bind_default;
this_thr->th.th_set_nproc = 0;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
ompt_task_info_t *parent_task_info;
parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
if (ompt_enabled.ompt_callback_parallel_begin) {
int team_size = 1;
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
&(parent_task_info->task_data), &(parent_task_info->frame),
&ompt_parallel_data, team_size,
ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
}
}
#endif
if (this_thr->th.th_team != serial_team) {
int level = this_thr->th.th_team->t.t_level;
if (serial_team->t.t_serialized) {
/* The cached serial team is already in use, so allocate a new one.
TODO increase performance by making these locks more specific */
kmp_team_t *new_team;
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
new_team =
__kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
proc_bind, &this_thr->th.th_current_task->td_icvs,
0 USE_NESTED_HOT_ARG(NULL));
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
KMP_ASSERT(new_team);
new_team->t.t_threads[0] = this_thr;
new_team->t.t_parent = this_thr->th.th_team;
serial_team = new_team;
this_thr->th.th_serial_team = serial_team;
KF_TRACE(
10,
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
global_tid, serial_team));
/* TODO the above breaks the requirement that if we run out of resources,
then we can still guarantee that serialized teams are ok, since we may
need to allocate a new one */
} else {
KF_TRACE(
10,
("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
global_tid, serial_team));
}
KMP_DEBUG_ASSERT(serial_team->t.t_threads);
KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
serial_team->t.t_ident = loc;
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
if (this_thr->th.th_team->t.t_nested_nth)
serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
else
serial_team->t.t_nested_nth = &__kmp_nested_nth;
serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
this_thr->th.th_current_task));
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
this_thr->th.th_current_task->td_flags.executing = 0;
__kmp_push_current_task_to_thread(this_thr, serial_team, 0);
/* TODO: do ICVs work for nested serialized teams? Don't we need an
implicit task for each serialized task represented by
team->t.t_serialized? */
copy_icvs(&this_thr->th.th_current_task->td_icvs,
&this_thr->th.th_current_task->td_parent->td_icvs);
kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
if (this_thr->th.th_team->t.t_nested_nth)
nested_nth = this_thr->th.th_team->t.t_nested_nth;
if (nested_nth->used && (level + 1 < nested_nth->used)) {
this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
if (__kmp_nested_proc_bind.used &&
(level + 1 < __kmp_nested_proc_bind.used)) {
this_thr->th.th_current_task->td_icvs.proc_bind =
__kmp_nested_proc_bind.bind_types[level + 1];
}
#if USE_DEBUGGER
serial_team->t.t_pkfn = (microtask_t)(~0);
#endif
this_thr->th.th_info.ds.ds_tid = 0;
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
this_thr->th.th_task_team = NULL;
this_thr->th.th_task_state = 0;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
serial_team->t.t_def_allocator = this_thr->th.th_def_allocator;
propagateFPControl(serial_team);
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
if (!serial_team->t.t_dispatch->th_disp_buffer) {
serial_team->t.t_dispatch->th_disp_buffer =
(dispatch_private_info_t *)__kmp_allocate(
sizeof(dispatch_private_info_t));
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
KMP_MB();
} else {
* that's fine, just add another nested level */
KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
KMP_DEBUG_ASSERT(serial_team->t.t_threads);
KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
++serial_team->t.t_serialized;
this_thr->th.th_team_serialized = serial_team->t.t_serialized;
int level = this_thr->th.th_team->t.t_level;
kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
if (serial_team->t.t_nested_nth)
nested_nth = serial_team->t.t_nested_nth;
if (nested_nth->used && (level + 1 < nested_nth->used)) {
this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
serial_team->t.t_level++;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
"of serial team %p to %d\n",
global_tid, serial_team, serial_team->t.t_level));
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
{
dispatch_private_info_t *disp_buffer =
(dispatch_private_info_t *)__kmp_allocate(
sizeof(dispatch_private_info_t));
disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
__kmp_push_task_team_node(this_thr, serial_team);
KMP_MB();
}
KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
if (__kmp_display_affinity) {
if (this_thr->th.th_prev_level != serial_team->t.t_level ||
this_thr->th.th_prev_num_threads != 1) {
__kmp_aux_display_affinity(global_tid, NULL);
this_thr->th.th_prev_level = serial_team->t.t_level;
this_thr->th.th_prev_num_threads = 1;
}
}
if (__kmp_env_consistency_check)
__kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
serial_team->t.ompt_team_info.master_return_address = codeptr;
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
OMPT_GET_FRAME_ADDRESS(0);
ompt_lw_taskteam_t lw_taskteam;
__ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
&ompt_parallel_data, codeptr);
__ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
ompt_task_implicit);
OMPT_CUR_TASK_INFO(this_thr)->thread_num =
__kmp_tid_from_gtid(global_tid);
}
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
OMPT_GET_FRAME_ADDRESS(0);
}
#endif
}
// Returns true when this fork is a parallel region nested directly inside a
// teams construct: the thread has a teams microtask recorded, a va_list of
// arguments was supplied, the microtask being forked is not
// __kmp_teams_master itself, and the current nesting level equals the level
// at which the teams construct was entered.
static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
                                          microtask_t microtask, int level,
                                          int teams_level, kmp_va_list ap) {
  if (!master_th->th.th_teams_microtask)
    return false;
  if (!ap)
    return false;
  if (microtask == (microtask_t)__kmp_teams_master)
    return false;
  return level == teams_level;
}
// Returns true when this fork belongs to the teams construct itself.
// Without a va_list (ap == NULL) that is the case only when no parallel
// region is active yet (active_level == 0); with a va_list it is the case
// when a teams level has been recorded (teams_level > 0) and it equals the
// current nesting level.
static inline bool __kmp_is_entering_teams(int active_level, int level,
                                           int teams_level, kmp_va_list ap) {
  if (ap == NULL)
    return active_level == 0;
  return teams_level > 0 && teams_level == level;
}
// Fork a parallel region that is closely nested inside a teams construct.
// No new team is allocated: parent_team (the team the caller is already
// running in) is reused for the region.
//
// loc                   source location, stored in parent_team->t.t_ident
// gtid                  global thread id of the calling (primary) thread
// parent_team           team that will execute the region
// argc / ap             number of microtask arguments and the va_list holding
//                       them; they are copied into parent_team->t.t_argv
// master_th / root      calling thread and its root
// call_context          which entry point forked (fork_context_gnu returns
//                       before the microtask is invoked here)
// microtask / invoker   outlined region body and the routine used to invoke it
// master_set_numthreads thread count requested for this region (0 = none)
// level                 nesting level of parent_team on entry
// ompt_parallel_data /
// return_address        OMPT bookkeeping (only when OMPT_SUPPORT is enabled)
//
// Every path visible in this function returns TRUE.
static inline int
__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
                    kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
                    enum fork_context_e call_context, microtask_t microtask,
                    launch_t invoker, int master_set_numthreads, int level,
#if OMPT_SUPPORT
                    ompt_data_t ompt_parallel_data, void *return_address,
#endif
                    kmp_va_list ap) {
  void **argv;
  int i;

  // Copy the microtask arguments out of the va_list into the team's argv.
  parent_team->t.t_ident = loc;
  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
  parent_team->t.t_argc = argc;
  argv = (void **)parent_team->t.t_argv;
  for (i = argc - 1; i >= 0; --i) {
    *argv++ = va_arg(kmp_va_deref(ap), void *);
  }

  // Serialized case: the enclosing team is this thread's serial team, so the
  // nested region is executed by the calling thread alone.
  if (parent_team == master_th->th.th_serial_team) {
    __kmpc_serialized_parallel(loc, gtid);
    KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

    if (call_context == fork_context_gnu) {
      // Undo the t_serialized increment made by __kmpc_serialized_parallel();
      // the GNU entry point runs the region body itself after we return.
      parent_team->t.t_serialized--;
      return TRUE;
    }

#if OMPD_SUPPORT
    parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
    void *dummy;
    void **exit_frame_p;
    ompt_data_t *implicit_task_data;
    ompt_lw_taskteam_t lw_taskteam;

    if (ompt_enabled.enabled) {
      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                              &ompt_parallel_data, return_address);
      exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);

      // Report the beginning of the implicit task.
      implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
      if (ompt_enabled.ompt_callback_implicit_task) {
        OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
            1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }

      master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
    } else {
      // Give the microtask a harmless place to store its exit frame.
      exit_frame_p = &dummy;
    }
#endif

    // Undo the t_serialized increment made by __kmpc_serialized_parallel()
    // before running the region body.
    parent_team->t.t_serialized--;

    {
      KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
      KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                             ,
                             exit_frame_p
#endif
      );
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      *exit_frame_p = NULL;
      OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
      // Report implicit-task end, then parallel end.
      if (ompt_enabled.ompt_callback_implicit_task) {
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, implicit_task_data, 1,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
      if (ompt_enabled.ompt_callback_parallel_end) {
        ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
            &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
            OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
    return TRUE;
  }

  // Active case: run the region on the existing (multi-thread) parent_team.
  parent_team->t.t_pkfn = microtask;
  parent_team->t.t_invoke = invoker;
  KMP_ATOMIC_INC(&root->r.r_in_parallel);
  parent_team->t.t_active_level++;
  parent_team->t.t_level++;
  parent_team->t.t_def_allocator = master_th->th.th_def_allocator;

  // Record the team's current size as the per-team thread count; it is the
  // upper bound checked against master_set_numthreads below.
  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
                            return_address);
    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
  }
#endif

  // Apply a requested thread count. The team can only be shrunk here: a
  // request larger than th_teams_size.nth is ignored.
  if (master_set_numthreads) {
    if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
      kmp_info_t **other_threads = parent_team->t.t_threads;
      int old_proc = master_th->th.th_teams_size.nth;
      // With the distributed barrier this block runs even if the size is
      // unchanged (the <= above admits equality).
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
      }
      parent_team->t.t_nproc = master_set_numthreads;
      for (i = 0; i < master_set_numthreads; ++i) {
        other_threads[i]->th.th_team_nproc = master_set_numthreads;
      }
    }
    // The request has been consumed; clear it for the next region.
    master_th->th.th_set_nproc = 0;
  }

#if USE_DEBUGGER
  if (__kmp_debugging) {
    // A positive value from __kmp_omp_num_threads() replaces the request.
    // NOTE(review): master_set_numthreads is not read again below in this
    // function, so this assignment looks like it has no effect here.
    int nth = __kmp_omp_num_threads(loc);
    if (nth > 0) {
      master_set_numthreads = nth;
    }
  }
#endif

  // Determine the proc_bind policy for this region (proc_bind) and, if it
  // must change, the proc-bind ICV value for the implicit tasks
  // (proc_bind_icv; proc_bind_default means "leave unchanged").
  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else {
    // No policy was set for this region: use the current ICV value.
    if (proc_bind == proc_bind_default) {
      proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
    }
    /* else: The proc_bind policy was specified explicitly for this region.
       This overrides proc-bind-var for this parallel region, but does not
       change proc-bind-var. */
    // Take the ICV for the next level from the nested proc_bind list when it
    // has an entry for that level that differs from the current ICV.
    if ((level + 1 < __kmp_nested_proc_bind.used) &&
        (__kmp_nested_proc_bind.bind_types[level + 1] !=
         master_th->th.th_current_task->td_icvs.proc_bind)) {
      proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
    }
  }
  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
  // Push the new ICV value into the current task of every team thread.
  if (proc_bind_icv != proc_bind_default &&
      master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
    kmp_info_t **other_threads = parent_team->t.t_threads;
    for (i = 0; i < master_th->th.th_team_nproc; ++i) {
      other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
    }
  }
  // The per-region proc_bind request has been consumed.
  master_th->th.th_set_proc_bind = proc_bind_default;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Record the frame start timestamp only at active level 1 and only when
  // there is a single team.
  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
       KMP_ITT_DEBUG) &&
      __kmp_forkjoin_frames_mode == 3 &&
      parent_team->t.t_active_level == 1 &&
      master_th->th.th_teams_size.nteams == 1) {
    kmp_uint64 tmp_time = __itt_get_timestamp();
    master_th->th.th_frame_time = tmp_time;
    parent_team->t.t_region_time = tmp_time;
  }
  if (__itt_stack_caller_create_ptr) {
    KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
    // Create the stack-stitching id before the team is released.
    parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
  }
#endif
#if KMP_AFFINITY_SUPPORTED
  __kmp_partition_places(parent_team);
#endif

  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));
  __kmp_internal_fork(loc, gtid, parent_team);
  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));

  // The GNU entry point invokes the microtask itself after we return.
  if (call_context == fork_context_gnu)
    return TRUE;

  // Invoke the microtask on the primary thread.
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));

  if (!parent_team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));
  KMP_MB();

  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));

  return TRUE;
}
// Execute a parallel region serialized on the calling thread.
//
// __kmpc_serialized_parallel() is called first to enter the serialized
// region. What happens next depends on call_context:
//   - fork_context_intel: the region body is run right here. Three sub-cases:
//       * ap is null: arguments are taken from parent_team->t.t_argv and the
//         microtask is invoked directly;
//       * microtask is __kmp_teams_master: arguments are copied from ap into
//         the serial team's argv and `invoker` is called;
//       * otherwise: arguments are copied from ap into a local array and the
//         microtask is invoked directly.
//   - fork_context_gnu: nothing is invoked here (only OMPT light-weight
//     task-team linking is done when OMPT is enabled).
//   - anything else: asserted to be below fork_context_last.
//
// ompt_parallel_data, return_address and parent_task_data (OMPT_SUPPORT only)
// are passed by pointer; *ompt_parallel_data is overwritten with the current
// team data on the direct-invocation paths when OMPT is enabled.
//
// Every path visible in this function returns FALSE.
static inline int
__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
                       kmp_int32 argc, microtask_t microtask, launch_t invoker,
                       kmp_info_t *master_th, kmp_team_t *parent_team,
#if OMPT_SUPPORT
                       ompt_data_t *ompt_parallel_data, void **return_address,
                       ompt_data_t **parent_task_data,
#endif
                       kmp_va_list ap) {
  kmp_team_t *team;
  int i;
  void **argv;

// Scratch storage for the argument pointers on the generic path below.
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
  SimpleVLA<void *> args(argc);
#else
  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
#endif /* KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ||
          KMP_ARCH_AARCH64) */

  KA_TRACE(
      20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));

  __kmpc_serialized_parallel(loc, gtid);

#if OMPD_SUPPORT
  master_th->th.th_serial_team->t.t_pkfn = microtask;
#endif

  if (call_context == fork_context_intel) {
    master_th->th.th_serial_team->t.t_ident = loc;
    if (!ap) {
      // No va_list: take the arguments from the parent team's argv.
      // NOTE(review): the decrement presumably reverts the level bump done in
      // __kmpc_serialized_parallel() - confirm against that function.
      master_th->th.th_serial_team->t.t_level--;

#if OMPT_SUPPORT
      void *dummy;
      void **exit_frame_p;
      ompt_task_info_t *task_info;
      ompt_lw_taskteam_t lw_taskteam;

      if (ompt_enabled.enabled) {
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                ompt_parallel_data, *return_address);

        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
        // From here on the current task info is used, not lw_taskteam.
        task_info = OMPT_CUR_TASK_INFO(master_th);
        exit_frame_p = &(task_info->frame.exit_frame.ptr);
        if (ompt_enabled.ompt_callback_implicit_task) {
          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
              &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }

        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
      } else {
        // Give the microtask a harmless place to store its exit frame.
        exit_frame_p = &dummy;
      }
#endif

      {
        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                               ,
                               exit_frame_p
#endif
        );
      }

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        *exit_frame_p = NULL;
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }
        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
        __ompt_lw_taskteam_unlink(master_th);
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    } else if (microtask == (microtask_t)__kmp_teams_master) {
      // Serialized teams construct: stage the arguments in the serial team
      // and let the supplied invoker run the teams master.
      KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
      team = master_th->th.th_team;
      team->t.t_invoke = invoker;
      __kmp_alloc_argv_entries(argc, team, TRUE);
      team->t.t_argc = argc;
      argv = (void **)team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // NOTE(review): presumably reverts the level bump done in
      // __kmpc_serialized_parallel() - confirm against that function.
      team->t.t_level--;
      invoker(gtid);
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 0,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
        }
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_league,
              *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    } else {
      // Generic serialized region: copy the va_list arguments into the local
      // scratch array and invoke the microtask directly.
      argv = args;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      KMP_MB();

#if OMPT_SUPPORT
      void *dummy;
      void **exit_frame_p;
      ompt_task_info_t *task_info;
      ompt_lw_taskteam_t lw_taskteam;
      ompt_data_t *implicit_task_data;

      if (ompt_enabled.enabled) {
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                ompt_parallel_data, *return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
        // From here on the current task info is used, not lw_taskteam.
        task_info = OMPT_CUR_TASK_INFO(master_th);
        exit_frame_p = &(task_info->frame.exit_frame.ptr);

        implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
              implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
              ompt_task_implicit);
          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
        }

        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
      } else {
        // Give the microtask a harmless place to store its exit frame.
        exit_frame_p = &dummy;
      }
#endif

      {
        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
        __kmp_invoke_microtask(microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
                               ,
                               exit_frame_p
#endif
        );
      }

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        *exit_frame_p = NULL;
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }

        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
        __ompt_lw_taskteam_unlink(master_th);
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    }
  } else if (call_context == fork_context_gnu) {
    // GNU entry point: the microtask is not invoked here; only the OMPT
    // light-weight task team is linked when OMPT is enabled.
#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      ompt_lw_taskteam_t lwt;
      __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
                              *return_address);

      lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
      __ompt_lw_taskteam_link(&lwt, master_th, 1);
    }
#endif

    KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
    return FALSE;
  } else {
    KMP_ASSERT2(call_context < fork_context_last,
                "__kmp_serial_fork_call: unknown fork_context parameter");
  }

  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
  KMP_MB();
  return FALSE;
}
// Main entry point for starting a parallel region.
//
// loc           source location of the construct
// gtid          global thread id of the calling thread (becomes the primary
//               thread of the new team)
// call_context  which entry point is forking; for fork_context_gnu the
//               function returns after the team is released, without
//               invoking the microtask
// argc          number of microtask arguments
// microtask     outlined region body
// invoker       routine stored in the team and used to invoke the microtask
// ap            va_list with the microtask arguments; when it is null the
//               arguments are copied from the parent team's argv instead and
//               __kmp_internal_fork() is skipped
//
// Flow:
//   1. A parallel closely nested in a teams construct is delegated to
//      __kmp_fork_in_teams().
//   2. The team size is determined. If it is 1 the region is delegated to
//      __kmp_serial_fork_call().
//   3. Otherwise a team is obtained from __kmp_allocate_team(), configured,
//      its threads are forked, and (unless call_context is fork_context_gnu)
//      the primary thread invokes the microtask through team->t.t_invoke.
//
// Locking: __kmp_forkjoin_lock is acquired before reserving threads when
// more than one thread is wanted. It is released immediately if only one
// thread could be reserved, otherwise after __kmp_setup_icv_copy().
//
// Returns TRUE on the paths that reach the end of this function or return
// early for fork_context_gnu; otherwise the result of the helper the region
// was delegated to.
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context,
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int task_thread_limit = 0;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // Scope of the KMP_fork_call developer timer.
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      // Pad the stack of a root thread by __kmp_stkpadding bytes.
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      // NOTE(review): this conditional use of `dummy` presumably exists so
      // the allocation is not optimized away - confirm.
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    // Make sure the runtime is initialized for parallel execution.
    KMP_DEBUG_ASSERT(
        __kmp_init_serial);
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    // Snapshot the caller's state.
    master_th = __kmp_threads[gtid];

    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;
    task_thread_limit =
        master_th->th.th_current_task->td_icvs.task_thread_limit;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    __kmp_assign_root_init_mask();

    // level is used below as an index into the nested nthreads / proc_bind
    // lists; active_level and teams_level feed the teams checks.
    level = parent_team->t.t_level;
    active_level = parent_team->t.t_active_level;
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    // Lazily allocate the per-thread hot-teams array; slot 0 is the root's
    // hot team.
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        // Requested team size reported to the tool (the value before thread
        // reservation).
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    // Parallel closely nested in a teams construct: handled separately.
    if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
      return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
                                 call_context, microtask, invoker,
                                 master_set_numthreads, level,
#if OMPT_SUPPORT
                                 ompt_parallel_data, return_address,
#endif
                                 ap);
    }

    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);

    // Determine the number of threads for the new team.
    int enter_teams =
        __kmp_is_entering_teams(active_level, level, teams_level, ap);
    if ((!enter_teams &&
         (parent_team->t.t_active_level >=
          master_th->th.th_current_task->td_icvs.max_active_levels)) ||
        (__kmp_library == library_serial)) {
      // Max active levels reached (and not entering teams), or the serial
      // library mode is selected: serialize.
      KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
      nthreads = 1;
    } else {
      // Explicit request if present, otherwise the nproc ICV.
      nthreads = master_set_numthreads
                     ? master_set_numthreads
                     : get__nproc_2(parent_team, master_tid);
      // Cap by the task's thread limit when one is set and is smaller.
      nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
                     ? task_thread_limit
                     : nthreads;
      // The fork/join lock is only needed when more than one thread is
      // wanted.
      if (nthreads > 1) {
        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* If teams is executed from a parallel region (on host), then teams
           should be created but each can only have 1 thread if nesting is
           disabled. If teams called from serial region, then teams and their
           threads should be created regardless of the nesting setting. */
        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
                                         nthreads, enter_teams);
        if (nthreads == 1) {
          // Single-thread execution: release the lock now. For a
          // multi-thread team it is released after the team is set up.
          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
        }
      }
    }
    KMP_DEBUG_ASSERT(nthreads > 0);

    // The per-region thread-count request has been consumed.
    master_th->th.th_set_nproc = 0;

    if (nthreads == 1) {
      return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
                                    invoker, master_th, parent_team,
#if OMPT_SUPPORT
                                    &ompt_parallel_data, &return_address,
                                    &parent_task_data,
#endif
                                    ap);
    }

    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                  "curtask=%p, curtask_max_aclevel=%d\n",
                  parent_team->t.t_active_level, master_th,
                  master_th->th.th_current_task,
                  master_th->th.th_current_task->td_icvs.max_active_levels));
    // The encountering task stops executing while the region runs.
    master_th->th.th_current_task->td_flags.executing = 0;

    if (!master_th->th.th_teams_microtask || level > teams_level) {
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
    }

    // Decide whether the new team needs a modified copy of the ICVs.
    // nthreads_icv == 0 means "do not change the nproc ICV".
    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
    kmp_nested_nthreads_t *nested_nth = NULL;
    if (!master_th->th.th_set_nested_nth &&
        (level + 1 < parent_team->t.t_nested_nth->used) &&
        (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
      nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
    } else if (master_th->th.th_set_nested_nth) {
      nested_nth = __kmp_override_nested_nth(master_th, level);
      if ((level + 1 < nested_nth->used) &&
          (nested_nth->nth[level + 1] != nthreads_icv))
        nthreads_icv = nested_nth->nth[level + 1];
      else
        nthreads_icv = 0;
    } else {
      nthreads_icv = 0;
    }

    // Determine the proc_bind policy for the new team (proc_bind) and the
    // proc-bind ICV for its threads (proc_bind_icv; proc_bind_default means
    // "do not change").
    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
      proc_bind = proc_bind_false;
    } else {
      // No policy was set for this region: use the current ICV value.
      if (proc_bind == proc_bind_default) {
        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
      }
      // The fork of the teams master takes its policy from
      // __kmp_teams_proc_bind.
      if (master_th->th.th_teams_microtask &&
          microtask == (microtask_t)__kmp_teams_master) {
        proc_bind = __kmp_teams_proc_bind;
      }
      /* else: The proc_bind policy was specified explicitly for this region.
         This overrides proc-bind-var for this parallel region, but does not
         change proc-bind-var. */
      // Take the ICV for the next level from the nested proc_bind list when
      // it has an entry for that level that differs from the current ICV.
      if ((level + 1 < __kmp_nested_proc_bind.used) &&
          (__kmp_nested_proc_bind.bind_types[level + 1] !=
           master_th->th.th_current_task->td_icvs.proc_bind)) {
        // Leave the ICV untouched for the two forks that belong to a teams
        // construct (teams master microtask, or no va_list).
        if (!master_th->th.th_teams_microtask ||
            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
      }
    }

    // The per-region proc_bind request has been consumed.
    master_th->th.th_set_proc_bind = proc_bind_default;

    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
      // Build a modified ICV set and allocate the team with it.
      kmp_internal_control_t new_icvs;
      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
      new_icvs.next = NULL;
      if (nthreads_icv > 0) {
        new_icvs.nproc = nthreads_icv;
      }
      if (proc_bind_icv != proc_bind_default) {
        new_icvs.proc_bind = proc_bind_icv;
      }

      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind, &new_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
    } else {
      // No ICV changes: allocate the team with the current task's ICVs.
      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
      team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
                                 ompt_parallel_data,
#endif
                                 proc_bind,
                                 &master_th->th.th_current_task->td_icvs,
                                 argc USE_NESTED_HOT_ARG(master_th));
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
                  &master_th->th.th_current_task->td_icvs);
    }
    KF_TRACE(
        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));

    // Set up the new team.
    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
    KMP_CHECK_UPDATE(team->t.t_ident, loc);
    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
                          return_address);
#endif
    KMP_CHECK_UPDATE(team->t.t_invoke, invoker);
    if (!master_th->th.th_teams_microtask || level > teams_level) {
      int new_level = parent_team->t.t_level + 1;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level + 1;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    } else {
      // Inside a teams microtask at or below the teams level: the new team
      // inherits the parent's levels unchanged.
      int new_level = parent_team->t.t_level;
      KMP_CHECK_UPDATE(team->t.t_level, new_level);
      new_level = parent_team->t.t_active_level;
      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
    }
    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);

    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);

    // If the team carries its own nested-nthreads list (one that is not the
    // parent's), free it before inheriting the parent's list.
    if (team->t.t_nested_nth &&
        team->t.t_nested_nth != parent_team->t.t_nested_nth) {
      KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
      KMP_INTERNAL_FREE(team->t.t_nested_nth);
      team->t.t_nested_nth = NULL;
    }
    team->t.t_nested_nth = parent_team->t.t_nested_nth;
    if (master_th->th.th_set_nested_nth) {
      // A per-region nested list was requested: install the override and
      // clear the request on the thread.
      if (!nested_nth)
        nested_nth = __kmp_override_nested_nth(master_th, level);
      team->t.t_nested_nth = nested_nth;
      KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
      master_th->th.th_set_nested_nth = NULL;
      master_th->th.th_set_nested_nth_sz = 0;
      master_th->th.th_nt_strict = false;
    }

    propagateFPControl(team);
#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_parallel_begin();
#endif

    KA_TRACE(
        20,
        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
         team->t.t_nproc));
    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
                     (team->t.t_master_tid == 0 &&
                      (team->t.t_parent == root->r.r_root_team ||
                       team->t.t_parent->t.t_serialized)));
    KMP_MB();

    // Set up the microtask arguments.
    argv = (void **)team->t.t_argv;
    if (ap) {
      for (i = argc - 1; i >= 0; --i) {
        void *new_argv = va_arg(kmp_va_deref(ap), void *);
        KMP_CHECK_UPDATE(*argv, new_argv);
        argv++;
      }
    } else {
      // No va_list: take the arguments from the parent team.
      for (i = 0; i < argc; ++i) {
        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
      }
    }

    // Fork the threads.
    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
    if (!root->r.r_active) // write only when the value actually changes
      root->r.r_active = TRUE;

    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
    __kmp_setup_icv_copy(team, nthreads,
                         &master_th->th.th_current_task->td_icvs, loc);

#if OMPT_SUPPORT
    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif

    // Team is set up; release the lock taken before thread reservation.
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

#if USE_ITT_BUILD
    // ITT frame reporting: only at active level 1 and outside teams.
    if (team->t.t_active_level == 1
        && !master_th->th.th_teams_microtask) {
#if USE_ITT_NOTIFY
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          (__kmp_forkjoin_frames_mode == 3 ||
           __kmp_forkjoin_frames_mode == 1)) {
        kmp_uint64 tmp_time = 0;
        if (__itt_get_timestamp_ptr)
          tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        if (__kmp_forkjoin_frames_mode == 3)
          team->t.t_region_time = tmp_time;
      } else
      // The two schemes are alternatives: the `else` above binds to the `if`
      // below when USE_ITT_NOTIFY is enabled.
#endif
          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
      }
    }
#endif

    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
    KMP_MB();
    KF_TRACE(10,
             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
              root, team, master_th, gtid));

#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr) {
      // Create the stack-stitching id before the team is released. When
      // entering teams it is stored in a serialized parent_team instead of
      // the new team.
      if (!enter_teams) {
        KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
        team->t.t_stack_id = __kmp_itt_stack_caller_create();
      } else if (parent_team->t.t_serialized) {
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
    }
#endif

    // __kmp_internal_fork() is skipped when there is no va_list.
    if (ap) {
      __kmp_internal_fork(loc, gtid, team);
      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, team, master_th, gtid));
    }

    // The GNU entry point invokes the microtask itself after we return.
    if (call_context == fork_context_gnu) {
      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
      return TRUE;
    }

    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                  team->t.t_id, team->t.t_pkfn));
  } // End of the KMP_fork_call developer timer scope.

#if KMP_STATS_ENABLED
  // With no va_list, switch the stats state to TEAMS_REGION for the
  // duration of the invocation.
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (!ap) {
    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
  }
#endif

  // Invoke the microtask on the primary thread.
  if (!team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }

#if KMP_STATS_ENABLED
  if (!ap) {
    KMP_SET_THREAD_STATE(previous_state);
  }
#endif

  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                team->t.t_id, team->t.t_pkfn));
  KMP_MB();

  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  return TRUE;
}
#if OMPT_SUPPORT
// Set the thread's OMPT state according to `team`: ompt_state_work_serial
// when the team is serialized, ompt_state_work_parallel otherwise.
static inline void __kmp_join_restore_state(kmp_info_t *thread,
                                            kmp_team_t *team) {
  if (team->t.t_serialized) {
    thread->th.ompt_thread_info.state = ompt_state_work_serial;
  } else {
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
}
static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
kmp_team_t *team, ompt_data_t *parallel_data,
int flags, void *codeptr) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
parallel_data, &(task_info->task_data), flags, codeptr);
}
task_info->frame.enter_frame = ompt_data_none;
__kmp_join_restore_state(thread, team);
}
#endif
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
,
enum fork_context_e fork_context
#endif
,
int exit_teams) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int master_active;
KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
master_th = __kmp_threads[gtid];
root = master_th->th.th_root;
team = master_th->th.th_team;
parent_team = team->t.t_parent;
master_th->th.th_ident = loc;
#if OMPT_SUPPORT
void *team_microtask = (void *)team->t.t_pkfn;
if (ompt_enabled.enabled &&
!(team->t.t_serialized && fork_context == fork_context_gnu)) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
"th_task_team = %p\n",
__kmp_gtid_from_thread(master_th), team,
team->t.t_task_team[master_th->th.th_task_state],
master_th->th.th_task_team));
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
}
#endif
if (team->t.t_serialized) {
if (master_th->th.th_teams_microtask) {
int level = team->t.t_level;
int tlevel = master_th->th.th_teams_level;
if (level == tlevel) {
team->t.t_level++;
} else if (level == tlevel + 1) {
team->t.t_serialized++;
}
}
__kmpc_end_serialized_parallel(loc, gtid);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (fork_context == fork_context_gnu) {
__ompt_lw_taskteam_unlink(master_th);
}
__kmp_join_restore_state(master_th, parent_team);
}
#endif
return;
}
master_active = team->t.t_master_active;
if (!exit_teams) {
__kmp_internal_join(loc, gtid, team);
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
__kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
team->t.t_stack_id = NULL;
}
#endif
} else {
master_th->th.th_task_state =
0;
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
__kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
parent_team->t.t_stack_id = NULL;
}
#endif
}
KMP_MB();
#if OMPT_SUPPORT
ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
void *codeptr = team->t.ompt_team_info.master_return_address;
#endif
#if USE_ITT_BUILD
if (team->t.t_active_level == 1 &&
(!master_th->th.th_teams_microtask ||
master_th->th.th_teams_size.nteams == 1)) {
master_th->th.th_ident = loc;
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames_mode == 3)
__kmp_itt_frame_submit(gtid, team->t.t_region_time,
master_th->th.th_frame_time, 0, loc,
master_th->th.th_team_nproc, 1);
else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
!__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
__kmp_itt_region_joined(gtid);
}
#endif
#if KMP_AFFINITY_SUPPORTED
if (!exit_teams) {
master_th->th.th_first_place = team->t.t_first_place;
master_th->th.th_last_place = team->t.t_last_place;
}
#endif
if (master_th->th.th_teams_microtask && !exit_teams &&
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
team->t.t_level == master_th->th.th_teams_level + 1) {
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
if (ompt_enabled.enabled) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_implicit_task) {
int ompt_team_size = team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
}
task_info->frame.exit_frame = ompt_data_none;
task_info->task_data = ompt_data_none;
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
__ompt_lw_taskteam_unlink(master_th);
}
#endif
team->t.t_level--;
team->t.t_active_level--;
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
int old_num = master_th->th.th_team_nproc;
int new_num = master_th->th.th_teams_size.nth;
kmp_info_t **other_threads = team->t.t_threads;
team->t.t_nproc = new_num;
for (int i = 0; i < old_num; ++i) {
other_threads[i]->th.th_team_nproc = new_num;
}
for (int i = old_num; i < new_num; ++i) {
KMP_DEBUG_ASSERT(other_threads[i]);
kmp_balign_t *balign = other_threads[i]->th.th_bar;
for (int b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
other_threads[i]->th.th_task_state = master_th->th.th_task_state;
}
}
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
__kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
}
#endif
return;
}
master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
master_th->th.th_local.this_construct = team->t.t_master_this_cons;
master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
/* The lock acquired below has release and acquire semantics,
separating the parallel user code called in this parallel region
from the serial user code called after this function returns. */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
if (!master_th->th.th_teams_microtask ||
team->t.t_level > master_th->th.th_teams_level) {
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
}
KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_implicit_task) {
int flags = (team_microtask == (void *)__kmp_teams_master)
? ompt_task_initial
: ompt_task_implicit;
int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
}
task_info->frame.exit_frame = ompt_data_none;
task_info->task_data = ompt_data_none;
}
#endif
KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
master_th, team));
__kmp_pop_current_task_from_thread(master_th);
master_th->th.th_def_allocator = team->t.t_def_allocator;
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
ompd_bp_parallel_end();
#endif
updateHWFPControl(team);
if (root->r.r_active != master_active)
root->r.r_active = master_active;
__kmp_free_team(root, team USE_NESTED_HOT_ARG(
master_th));
/* Make sure the following assignments are inside the critical
region; otherwise assertions may fail occasionally, since the old team may be
reallocated and the hierarchy appears inconsistent. It is actually safe to
run and won't cause any bugs, but will cause those assertion failures. It's
only one deref&assign, so it might as well be put in the critical region. */
master_th->th.th_team = parent_team;
master_th->th.th_team_nproc = parent_team->t.t_nproc;
master_th->th.th_team_master = parent_team->t.t_threads[0];
master_th->th.th_team_serialized = parent_team->t.t_serialized;
if (parent_team->t.t_serialized &&
parent_team != master_th->th.th_serial_team &&
parent_team != root->r.r_root_team) {
__kmp_free_team(root,
master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
master_th->th.th_serial_team = parent_team;
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
team->t.t_primary_task_state == 1);
master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
KA_TRACE(20,
("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
parent_team));
}
master_th->th.th_current_task->td_flags.executing = 1;
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if KMP_AFFINITY_SUPPORTED
if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
__kmp_reset_root_init_mask(gtid);
}
#endif
#if OMPT_SUPPORT
int flags =
OMPT_INVOKER(fork_context) |
((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
: ompt_parallel_team);
if (ompt_enabled.enabled) {
__kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
codeptr);
}
#endif
KMP_MB();
KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
/* Check whether we should push an internal control record onto the
serial team stack. If so, do it. */
/* Snapshot the calling thread's current ICVs onto its serial team's control
   stack, if a snapshot for the current serial nesting level is not already
   on top of that stack.

   Nothing happens unless the thread is currently running on its own serial
   team and that team's t_serialized counter is greater than 1. */
void __kmp_save_internal_controls(kmp_info_t *thread) {
  kmp_team_t *cur_team = thread->th.th_team;

  // Only applies while the thread executes on its own serial team.
  if (cur_team != thread->th.th_serial_team)
    return;

  // No record is kept for the outermost serialized level.
  if (cur_team->t.t_serialized <= 1)
    return;

  kmp_internal_control_t *top = cur_team->t.t_control_stack_top;

  // A record for this nesting level is already on top of the stack.
  if (top != NULL && top->serial_nesting_level == cur_team->t.t_serialized)
    return;

  // Push a fresh record holding a copy of the current task's ICVs.
  kmp_internal_control_t *record = (kmp_internal_control_t *)__kmp_allocate(
      sizeof(kmp_internal_control_t));
  copy_icvs(record, &thread->th.th_current_task->td_icvs);
  record->serial_nesting_level = cur_team->t.t_serialized;
  record->next = top;
  cur_team->t.t_control_stack_top = record;
}
/* Set the nproc ICV of thread `gtid` to `new_nth`.

   The request is clamped to the range [1, __kmp_max_nth]. If the ICV already
   has that value nothing else is done. Otherwise the current ICVs are saved
   via __kmp_save_internal_controls() and the new value is stored. When the
   parallel runtime is initialized, the root is not active, and the root's hot
   team currently has more threads than new_nth, the hot team is shrunk to
   new_nth threads right away. */
void __kmp_set_num_threads(int new_nth, int gtid) {
kmp_info_t *thread;
kmp_root_t *root;
KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
KMP_DEBUG_ASSERT(__kmp_init_serial);
// Clamp the request into [1, __kmp_max_nth].
if (new_nth < 1)
new_nth = 1;
else if (new_nth > __kmp_max_nth)
new_nth = __kmp_max_nth;
KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
thread = __kmp_threads[gtid];
// Nothing to do if the ICV already holds this value.
if (thread->th.th_current_task->td_icvs.nproc == new_nth)
return;
__kmp_save_internal_controls(thread);
set__nproc(thread, new_nth);
// If the hot team is idle and larger than the new setting, shrink it now.
root = thread->th.th_root;
if (__kmp_init_parallel && (!root->r.r_active) &&
(root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
) {
kmp_team_t *hot_team = root->r.r_hot_team;
int f;
// The team is resized while holding the fork/join bootstrap lock.
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
__kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
}
// Hand every thread beyond the new size to __kmp_free_thread() and clear
// its slot in the team.
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
if (__kmp_tasking_mode != tskm_immediate_exec) {
// Threads leaving the team drop their task team pointer.
hot_team->t.t_threads[f]->th.th_task_team = NULL;
}
__kmp_free_thread(hot_team->t.t_threads[f]);
hot_team->t.t_threads[f] = NULL;
}
hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
// Keep the thread's level-0 hot team record in sync with the new size.
if (thread->th.th_hot_teams) {
KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
thread->th.th_hot_teams[0].hot_team_nth = new_nth;
}
#endif
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
hot_team->t.b->update_num_threads(new_nth);
__kmp_add_threads_to_team(hot_team, new_nth);
}
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
// Update the cached team size in the threads that remain in the team.
for (f = 0; f < new_nth; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
}
// NOTE(review): -1 looks like a marker that the size was changed by this
// call; the consumers of t_size_changed are outside this chunk - confirm.
hot_team->t.t_size_changed = -1;
}
}
void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
kmp_info_t *thread;
KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
"%d = (%d)\n",
gtid, max_active_levels));
KMP_DEBUG_ASSERT(__kmp_init_serial);
if (max_active_levels < 0) {
KMP_WARNING(ActiveLevelsNegative, max_active_levels);
KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
"max_active_levels for thread %d = (%d)\n",
gtid, max_active_levels));
return;
}
if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
} else {
KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
KMP_MAX_ACTIVE_LEVELS_LIMIT);
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
}
KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
"max_active_levels for thread %d = (%d)\n",
gtid, max_active_levels));
thread = __kmp_threads[gtid];
__kmp_save_internal_controls(thread);
set__max_active_levels(thread, max_active_levels);
}
int __kmp_get_max_active_levels(int gtid) {
kmp_info_t *thread;
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
KMP_DEBUG_ASSERT(__kmp_init_serial);
thread = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(thread->th.th_current_task);
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
"curtask_maxaclevel=%d\n",
gtid, thread->th.th_current_task,
thread->th.th_current_task->td_icvs.max_active_levels));
return thread->th.th_current_task->td_icvs.max_active_levels;
}
// Store `num_teams` in the global __kmp_nteams. Non-positive values are
// ignored and leave the global untouched.
void __kmp_set_num_teams(int num_teams) {
  if (num_teams <= 0)
    return;
  __kmp_nteams = num_teams;
}
// Return the current value of the global __kmp_nteams.
int __kmp_get_max_teams(void) {
  const int nteams = __kmp_nteams;
  return nteams;
}
// Store `limit` in the global __kmp_teams_thread_limit. Non-positive values
// are ignored and leave the global untouched.
void __kmp_set_teams_thread_limit(int limit) {
  if (limit <= 0)
    return;
  __kmp_teams_thread_limit = limit;
}
// Return the current value of the global __kmp_teams_thread_limit.
int __kmp_get_teams_thread_limit(void) {
  const int limit = __kmp_teams_thread_limit;
  return limit;
}
// Require that kmp_sched_t and enum sched_type are each exactly the size of an
// int. NOTE(review): KMP_BUILD_ASSERT is presumably a compile-time assertion,
// and the requirement presumably exists because schedule kinds are exchanged
// as plain ints elsewhere - confirm against the macro and its users.
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
kmp_info_t *thread;
kmp_sched_t orig_kind;
KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
gtid, (int)kind, chunk));
KMP_DEBUG_ASSERT(__kmp_init_serial);
orig_kind = kind;
kind = __kmp_sched_without_mods(kind);
if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
(kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
__kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
__kmp_msg_null);
kind = kmp_sched_default;
chunk = 0;
}
thread = __kmp_threads[gtid];
__kmp_save_internal_controls(thread);
if (kind < kmp_sched_upper_std) {
if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
} else {
thread->th.th_current_task->td_icvs.sched.r_sched_type =
__kmp_sch_map[kind - kmp_sched_lower - 1];
}
} else {
thread->th.th_current_task->td_icvs.sched.r_sched_type =
__kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
kmp_sched_lower - 2];
}
__kmp_sched_apply_mods_intkind(
orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
if (kind == kmp_sched_auto || chunk < 1) {
thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
} else {
thread->th.th_current_task->td_icvs.sched.chunk = chunk;
}
}
void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
kmp_info_t *thread;
enum sched_type th_type;
KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
KMP_DEBUG_ASSERT(__kmp_init_serial);
thread = __kmp_threads[gtid];
th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
case kmp_sch_static:
case kmp_sch_static_greedy:
case kmp_sch_static_balanced:
*kind = kmp_sched_static;
__kmp_sched_apply_mods_stdkind(kind, th_type);
*chunk = 0;
return;
case kmp_sch_static_chunked:
*kind = kmp_sched_static;
break;
case kmp_sch_dynamic_chunked:
*kind = kmp_sched_dynamic;
break;
case kmp_sch_guided_chunked:
case kmp_sch_guided_iterative_chunked:
case kmp_sch_guided_analytical_chunked:
*kind = kmp_sched_guided;
break;
case kmp_sch_auto:
*kind = kmp_sched_auto;
break;
case kmp_sch_trapezoidal:
*kind = kmp_sched_trapezoidal;
break;
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal:
*kind = kmp_sched_static_steal;
break;
#endif
default:
KMP_FATAL(UnknownSchedulingType, th_type);
}
__kmp_sched_apply_mods_stdkind(kind, th_type);
*chunk = thread->th.th_current_task->td_icvs.sched.chunk;
}
/* Return the thread number, at nesting level `level`, of thread `gtid` or of
   its ancestor at that level.

   Returns 0 for level 0, -1 for a negative level or a level deeper than the
   t_level of the thread's current team, and the thread's own tid when `level`
   is its current level. Otherwise the team hierarchy is walked upward through
   t_parent links, counting serialized levels (t_serialized) along the way. */
int __kmp_get_ancestor_thread_num(int gtid, int level) {
int ii, dd;
kmp_team_t *team;
kmp_info_t *thr;
KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
KMP_DEBUG_ASSERT(__kmp_init_serial);
// Trivial and invalid levels are answered without touching any team.
if (level == 0)
return 0;
if (level < 0)
return -1;
thr = __kmp_threads[gtid];
team = thr->th.th_team;
ii = team->t.t_level;
if (level > ii)
return -1;
// Inside a teams construct: when the requested level is at or above the
// teams level, ii is increased artificially (by 2 when the current level is
// the teams level, by 1 otherwise) before the walk.
// NOTE(review): presumably this compensates for several team structures
// sharing one t_level around the teams construct - confirm.
if (thr->th.th_teams_microtask) {
int tlevel = thr->th.th_teams_level;
if (level <=
tlevel) {
KMP_DEBUG_ASSERT(ii >= tlevel);
if (ii == tlevel) {
ii += 2;
} else {
ii++;
}
}
}
if (ii == level)
return __kmp_tid_from_gtid(gtid);
dd = team->t.t_serialized;
// The walk stops one level below the requested one, because the answer is
// read from that team's t_master_tid.
level++;
while (ii > level) {
// Consume the serialized levels of the current team one at a time.
for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
}
// All serialized levels of this team were consumed: move to the parent
// without consuming another level.
if ((team->t.t_serialized) && (!dd)) {
team = team->t.t_parent;
continue;
}
// Otherwise step to the parent team, which accounts for one level.
if (ii > level) {
team = team->t.t_parent;
dd = team->t.t_serialized;
ii--;
}
}
// More than one serialized level left in this team yields 0; otherwise the
// team's t_master_tid is returned.
return (dd > 1) ? (0) : (team->t.t_master_tid);
}
/* Return the size of the team that thread `gtid`, or its ancestor, belongs to
   at nesting level `level`.

   Returns 1 for level 0 and -1 for a negative level or a level deeper than
   the t_level of the thread's current team. Otherwise the team hierarchy is
   walked upward through t_parent links, counting serialized levels
   (t_serialized), and the t_nproc of the team reached is returned. */
int __kmp_get_team_size(int gtid, int level) {
int ii, dd;
kmp_team_t *team;
kmp_info_t *thr;
KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
KMP_DEBUG_ASSERT(__kmp_init_serial);
// Trivial and invalid levels are answered without touching any team.
if (level == 0)
return 1;
if (level < 0)
return -1;
thr = __kmp_threads[gtid];
team = thr->th.th_team;
ii = team->t.t_level;
if (level > ii)
return -1;
// Inside a teams construct: when the requested level is at or above the
// teams level, ii is increased artificially (by 2 when the current level is
// the teams level, by 1 otherwise) before the walk.
// NOTE(review): presumably this compensates for several team structures
// sharing one t_level around the teams construct - confirm.
if (thr->th.th_teams_microtask) {
int tlevel = thr->th.th_teams_level;
if (level <=
tlevel) {
KMP_DEBUG_ASSERT(ii >= tlevel);
if (ii == tlevel) {
ii += 2;
} else {
ii++;
}
}
}
while (ii > level) {
// Consume the serialized levels of the current team one at a time.
for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
}
// All serialized levels of this team were consumed: move to the parent
// without consuming another level.
if (team->t.t_serialized && (!dd)) {
team = team->t.t_parent;
continue;
}
// Otherwise step to the parent team, which accounts for one level.
if (ii > level) {
team = team->t.t_parent;
ii--;
}
}
return team->t.t_nproc;
}
/* Build a {schedule type, chunk} pair from the global schedule settings.

   The modifier-free part of __kmp_sched selects the result type: plain
   static maps to __kmp_static, guided chunked maps to __kmp_guided, and
   anything else is taken from __kmp_sched directly. The modifiers of
   __kmp_sched are then applied to the result. The chunk is __kmp_chunk,
   or KMP_DEFAULT_CHUNK when __kmp_chunk is smaller than that. */
kmp_r_sched_t __kmp_get_schedule_global() {
  kmp_r_sched_t result;
  enum sched_type base_kind = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
  enum sched_type mods = SCHEDULE_GET_MODIFIERS(__kmp_sched);

  switch (base_kind) {
  case kmp_sch_static:
    result.r_sched_type = __kmp_static;
    break;
  case kmp_sch_guided_chunked:
    result.r_sched_type = __kmp_guided;
    break;
  default:
    result.r_sched_type = __kmp_sched;
    break;
  }
  SCHEDULE_SET_MODIFIERS(result.r_sched_type, mods);

  // Guard against a __kmp_chunk below the default.
  if (__kmp_chunk >= KMP_DEFAULT_CHUNK) {
    result.chunk = __kmp_chunk;
  } else {
    result.chunk = KMP_DEFAULT_CHUNK;
  }
  return result;
}
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
at least argc number of *t_argv entries for the requested team. */
/* Make sure team->t.t_argv can hold at least `argc` entries.

   With realloc == 0 storage is always set up from scratch. With realloc != 0
   nothing is done when the current capacity (t_max_argc) suffices; otherwise
   a previously heap-allocated array is freed first. Up to
   KMP_INLINE_ARGV_ENTRIES entries use the array embedded in the team
   (t_inline_argv); larger requests get a page-allocated array of
   max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) entries. Existing argv contents
   are not copied. */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
  KMP_DEBUG_ASSERT(team);

  // Existing storage is big enough: keep it.
  if (realloc && argc <= team->t.t_max_argc)
    return;

  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
                 "current entries=%d\n",
                 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));

  // Release the old array unless it is the one embedded in the team.
  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
    __kmp_free((void *)team->t.t_argv);

  if (argc > KMP_INLINE_ARGV_ENTRIES) {
    // Too large for the inline array: allocate with room to grow.
    if (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) {
      team->t.t_max_argc = KMP_MIN_MALLOC_ARGV_ENTRIES;
    } else {
      team->t.t_max_argc = 2 * argc;
    }
    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                   "argv entries\n",
                   team->t.t_id, team->t.t_max_argc));
    team->t.t_argv =
        (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                   &team->t.t_argv[team->t.t_max_argc],
                                   sizeof(void *) * team->t.t_max_argc,
                                   "team_%d.t_argv", team->t.t_id);
    }
    return;
  }

  // Small request: point t_argv at the array embedded in the team.
  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                 "argv entries\n",
                 team->t.t_id, team->t.t_max_argc));
  team->t.t_argv = &team->t.t_inline_argv[0];
  if (__kmp_storage_map) {
    __kmp_print_storage_map_gtid(
        -1, &team->t.t_inline_argv[0],
        &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
        (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
        team->t.t_id);
  }
}
/* Allocate the per-team arrays sized for `max_nth` threads: t_threads,
   t_dispatch and t_implicit_task_taskdata get max_nth entries each, and
   t_disp_buffer gets __kmp_dispatch_num_buffers entries (or 2 when
   max_nth <= 1). Records max_nth in t_max_nproc and numbers the dispatch
   buffers. */
static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
  // A team that can never exceed one thread only needs two dispatch buffers.
  int buffer_count = 2;
  if (max_nth > 1)
    buffer_count = __kmp_dispatch_num_buffers;

  team->t.t_threads =
      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
      sizeof(dispatch_shared_info_t) * buffer_count);
  team->t.t_dispatch =
      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
  team->t.t_implicit_task_taskdata =
      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
  team->t.t_max_nproc = max_nth;

  // Each dispatch buffer starts out with its own position as both indices.
  for (int idx = 0; idx < buffer_count; ++idx) {
    dispatch_shared_info_t *buf = &team->t.t_disp_buffer[idx];
    buf->buffer_index = idx;
    buf->doacross_buf_idx = idx;
  }
}
/* Free the per-team arrays (t_threads, t_disp_buffer, t_dispatch,
   t_implicit_task_taskdata) together with every per-thread th_disp_buffer,
   and reset the team's pointers to NULL. The kmp_info_t objects referenced
   from t_threads are not freed here. */
static void __kmp_free_team_arrays(kmp_team_t *team) {
  // Release the private dispatch buffer of every slot first.
  for (int slot = 0; slot < team->t.t_max_nproc; ++slot) {
    kmp_disp_t *disp = &team->t.t_dispatch[slot];
    if (disp->th_disp_buffer == NULL)
      continue;
    __kmp_free(disp->th_disp_buffer);
    disp->th_disp_buffer = NULL;
  }
#if KMP_USE_HIER_SCHED
  __kmp_dispatch_free_hierarchies(team);
#endif

  __kmp_free(team->t.t_threads);
  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);

  team->t.t_threads = NULL;
  team->t.t_disp_buffer = NULL;
  team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
}
/* Re-create the per-team arrays for `max_nth` threads. The first t_nproc
   entries of the old t_threads array are carried over into the new one; the
   dispatch buffers, dispatch array and implicit task array are freed and
   allocated afresh by __kmp_allocate_team_arrays(). */
static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
  // Hold on to the old thread table; it is copied from after reallocation.
  kmp_info_t **previous_threads = team->t.t_threads;

  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);

  __kmp_allocate_team_arrays(team, max_nth);

  KMP_MEMCPY(team->t.t_threads, previous_threads,
             team->t.t_nproc * sizeof(kmp_info_t *));
  __kmp_free(previous_threads);
}
/* Build a kmp_internal_control_t initialized from the runtime's global
   defaults.

   NOTE(review): the initializer below is positional, so it depends on the
   field order of kmp_internal_control_t, which is declared outside this
   chunk. The field names given in the comments are inferred from the values
   and from fields used elsewhere in this file - confirm against the struct
   declaration before relying on them. */
static kmp_internal_control_t __kmp_get_global_icvs(void) {
kmp_r_sched_t r_sched =
__kmp_get_schedule_global();
KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
kmp_internal_control_t g_icvs = {
// presumably serial_nesting_level
0,
// presumably the dynamic-adjustment setting
(kmp_int8)__kmp_global.g.g_dynamic,
// presumably whether the blocktime was set explicitly
(kmp_int8)__kmp_env_blocktime,
// presumably blocktime
__kmp_dflt_blocktime,
#if KMP_USE_MONITOR
// presumably blocktime intervals (monitor builds only)
__kmp_bt_intervals,
#endif
// presumably nproc
__kmp_dflt_team_nth,
// presumably thread_limit
__kmp_cg_max_nth,
// presumably the per-task thread limit
__kmp_task_max_nth,
// presumably max_active_levels
__kmp_dflt_max_active_levels,
// run-time schedule {type, chunk} pair computed above
r_sched,
// presumably proc_bind: the first entry of the nested proc-bind list
__kmp_nested_proc_bind.bind_types[0],
// presumably default_device
__kmp_default_device,
// presumably next: the record is not linked into any control stack
NULL
};
return g_icvs;
}
/* Return a copy of the ICVs held by the current task of the team's thread 0,
   with the `next` link cleared so the copy is not part of any control
   stack. */
static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
  kmp_internal_control_t snapshot;
  kmp_info_t *first_thread = team->t.t_threads[0];

  // Set before the copy, exactly as the original does; copy_icvs() runs
  // afterwards and determines the final contents.
  snapshot.serial_nesting_level = 0;
  copy_icvs(&snapshot, &first_thread->th.th_current_task->td_icvs);
  snapshot.next = NULL;

  return snapshot;
}
/* Initialize a root structure: reset its state fields and allocate its root
   team and its hot team via __kmp_allocate_team(), both with the global
   default ICVs.

   The root must not be marked as begun (r_begin) on entry. Thread slots of
   both teams are left NULL here; they are filled in by the caller (see
   __kmp_register_root). */
static void __kmp_initialize_root(kmp_root_t *root) {
int f;
kmp_team_t *root_team;
kmp_team_t *hot_team;
int hot_team_max_nth;
kmp_r_sched_t r_sched =
__kmp_get_schedule_global();
kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
KMP_DEBUG_ASSERT(root);
KMP_ASSERT(!root->r.r_begin);
// Reset the root's own state.
__kmp_init_lock(&root->r.r_begin_lock);
root->r.r_begin = FALSE;
root->r.r_active = FALSE;
root->r.r_in_parallel = 0;
root->r.r_blocktime = __kmp_dflt_blocktime;
#if KMP_AFFINITY_SUPPORTED
root->r.r_affinity_assigned = FALSE;
#endif
// Allocate the root team.
// NOTE(review): the two literal 1s are presumably the initial and maximum
// team size, and the trailing 0 presumably argc - confirm against the
// __kmp_allocate_team declaration.
KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
root_team =
__kmp_allocate_team(root,
1,
1,
#if OMPT_SUPPORT
ompt_data_none,
#endif
__kmp_nested_proc_bind.bind_types[0], &r_icvs,
0
USE_NESTED_HOT_ARG(NULL)
);
#if USE_DEBUGGER
TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif
KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
root->r.r_root_team = root_team;
root_team->t.t_control_stack_top = NULL;
// The root team is a serialized single-thread team whose thread slot is
// still empty at this point.
root_team->t.t_threads[0] = NULL;
root_team->t.t_nproc = 1;
root_team->t.t_serialized = 1;
root_team->t.t_sched.sched = r_sched.sched;
root_team->t.t_nested_nth = &__kmp_nested_nth;
KA_TRACE(
20,
("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
// Allocate the hot team, starting at one thread with room for
// __kmp_dflt_team_nth_ub * 2 (argument roles presumably as above).
KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
hot_team =
__kmp_allocate_team(root,
1,
__kmp_dflt_team_nth_ub * 2,
#if OMPT_SUPPORT
ompt_data_none,
#endif
__kmp_nested_proc_bind.bind_types[0], &r_icvs,
0
USE_NESTED_HOT_ARG(NULL)
);
KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
root->r.r_hot_team = hot_team;
// NOTE(review): this resets the ROOT team's control stack a second time;
// the hot team's t_control_stack_top is not touched in this function.
// Possibly hot_team was intended here - confirm.
root_team->t.t_control_stack_top = NULL;
hot_team->t.t_parent = root_team;
// Clear every thread slot of the hot team up to its allocated capacity.
hot_team_max_nth = hot_team->t.t_max_nproc;
for (f = 0; f < hot_team_max_nth; ++f) {
hot_team->t.t_threads[f] = NULL;
}
hot_team->t.t_nproc = 1;
hot_team->t.t_sched.sched = r_sched.sched;
hot_team->t.t_size_changed = 0;
hot_team->t.t_nested_nth = &__kmp_nested_nth;
}
#ifdef KMP_DEBUG
// Node of the singly linked list of teams collected by the debug dump
// (__kmp_print_structure). The list ends with a sentinel node whose `next`
// is NULL.
typedef struct kmp_team_list_item {
// Team referenced by this node.
kmp_team_p const *entry;
// Next node in the list.
struct kmp_team_list_item *next;
} kmp_team_list_item_t;
// A list is referred to by a pointer to its first node.
typedef kmp_team_list_item_t *kmp_team_list_t;
/* Add `team`, and recursively its parent and its next-in-pool team, to the
   list used by the debug dump.

   The list must end with a sentinel node (next == NULL). A team is added at
   most once, and nodes are kept in ascending order of team id. A NULL team is
   ignored. */
static void __kmp_print_structure_team_accum(kmp_team_list_t list,
                                             kmp_team_p const *team) {
  kmp_team_list_t pos;

  KMP_DEBUG_ASSERT(list != NULL);
  if (team == NULL)
    return;

  // Collect the teams reachable from this one first.
  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);

  // Look for the team among the nodes already in the list.
  for (pos = list; pos->next != NULL && pos->entry != team; pos = pos->next) {
  }
  if (pos->next != NULL)
    return; // stopped before the sentinel: the team is already listed

  // Find the first node whose team id is greater than this team's id (or the
  // sentinel if there is none).
  for (pos = list; pos->next != NULL && pos->entry->t.t_id <= team->t.t_id;
       pos = pos->next) {
  }

  // Insert in front of `pos`: move pos's contents into a new node and reuse
  // pos itself for the new team.
  kmp_team_list_item_t *shifted = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
      sizeof(kmp_team_list_item_t));
  *shifted = *pos;
  pos->entry = team;
  pos->next = shifted;
}
// Print `title` followed by the team's id (hex) and address, or a "(nil)"
// placeholder when `team` is NULL.
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
  __kmp_printf("%s", title);
  if (team == NULL) {
    __kmp_printf(" - (nil)\n");
    return;
  }
  __kmp_printf("%2x %p\n", team->t.t_id, team);
}
// Print `title` followed by the thread's gtid and address, or a "(nil)"
// placeholder when `thread` is NULL.
static void __kmp_print_structure_thread(char const *title,
                                         kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread == NULL) {
    __kmp_printf(" - (nil)\n");
    return;
  }
  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
}
void __kmp_print_structure(void) {
kmp_team_list_t list;
list =
(kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
list->entry = NULL;
list->next = NULL;
__kmp_printf("\n------------------------------\nGlobal Thread "
"Table\n------------------------------\n");
{
int gtid;
for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
__kmp_printf("%2d", gtid);
if (__kmp_threads != NULL) {
__kmp_printf(" %p", __kmp_threads[gtid]);
}
if (__kmp_root != NULL) {
__kmp_printf(" %p", __kmp_root[gtid]);
}
__kmp_printf("\n");
}
}
__kmp_printf("\n------------------------------\nThreads\n--------------------"
"----------\n");
if (__kmp_threads != NULL) {
int gtid;
for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
kmp_info_t const *thread = __kmp_threads[gtid];
if (thread != NULL) {
__kmp_printf("GTID %2d %p:\n", gtid, thread);
__kmp_printf(" Our Root: %p\n", thread->th.th_root);
__kmp_print_structure_team(" Our Team: ", thread->th.th_team);
__kmp_print_structure_team(" Serial Team: ",
thread->th.th_serial_team);
__kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
__kmp_print_structure_thread(" Primary: ",
thread->th.th_team_master);
__kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
__kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
__kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
__kmp_print_structure_thread(" Next in pool: ",
thread->th.th_next_pool);
__kmp_printf("\n");
__kmp_print_structure_team_accum(list, thread->th.th_team);
__kmp_print_structure_team_accum(list, thread->th.th_serial_team);
}
}
} else {
__kmp_printf("Threads array is not allocated.\n");
}
__kmp_printf("\n------------------------------\nUbers\n----------------------"
"--------\n");
if (__kmp_root != NULL) {
int gtid;
for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
kmp_root_t const *root = __kmp_root[gtid];
if (root != NULL) {
__kmp_printf("GTID %2d %p:\n", gtid, root);
__kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
__kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
__kmp_print_structure_thread(" Uber Thread: ",
root->r.r_uber_thread);
__kmp_printf(" Active?: %2d\n", root->r.r_active);
__kmp_printf(" In Parallel: %2d\n",
KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
__kmp_printf("\n");
__kmp_print_structure_team_accum(list, root->r.r_root_team);
__kmp_print_structure_team_accum(list, root->r.r_hot_team);
}
}
} else {
__kmp_printf("Ubers array is not allocated.\n");
}
__kmp_printf("\n------------------------------\nTeams\n----------------------"
"--------\n");
while (list->next != NULL) {
kmp_team_p const *team = list->entry;
int i;
__kmp_printf("Team %2x %p:\n", team->t.t_id, team);
__kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
__kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
__kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
__kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
__kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
for (i = 0; i < team->t.t_nproc; ++i) {
__kmp_printf(" Thread %2d: ", i);
__kmp_print_structure_thread("", team->t.t_threads[i]);
}
__kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
__kmp_printf("\n");
list = list->next;
}
__kmp_printf("\n------------------------------\nPools\n----------------------"
"--------\n");
__kmp_print_structure_thread("Thread pool: ",
CCAST(kmp_info_t *, __kmp_thread_pool));
__kmp_print_structure_team("Team pool: ",
CCAST(kmp_team_t *, __kmp_team_pool));
__kmp_printf("\n");
while (list != NULL) {
kmp_team_list_item_t *item = list;
list = list->next;
KMP_INTERNAL_FREE(item);
}
}
#endif
// Table of 64 constants from which __kmp_init_random() picks the per-thread
// multiplier (th_a) used by __kmp_get_random(). The name suggests the values
// are primes; that is not verified here.
static const unsigned __kmp_primes[] = {
0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
/* Return the next pseudo-random value for `thread`: the bits of the current
   state th_x above the low 16. The state is then advanced with
   th_x = th_x * th_a + 1. */
unsigned short __kmp_get_random(kmp_info_t *thread) {
  const unsigned state = thread->th.th_x;

  // Advance the generator; the result comes from the state before the step.
  thread->th.th_x = state * thread->th.th_a + 1;

  const unsigned short result = (unsigned short)(state >> 16);
  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
                thread->th.th_info.ds.ds_tid, result));
  return result;
}
/* Seed the per-thread random generator from the thread's tid: the multiplier
   th_a is picked from __kmp_primes (indexed by tid modulo the table size) and
   the initial state is th_x = (tid + 1) * th_a + 1. */
void __kmp_init_random(kmp_info_t *thread) {
  const unsigned tid = thread->th.th_info.ds.ds_tid;
  const size_t table_len = sizeof(__kmp_primes) / sizeof(__kmp_primes[0]);

  thread->th.th_a = __kmp_primes[tid % table_len];
  thread->th.th_x = (tid + 1) * thread->th.th_a + 1;

  KA_TRACE(30,
           ("__kmp_init_random: THREAD: %u; A: %u\n", tid, thread->th.th_a));
}
#if KMP_OS_WINDOWS
/* Reclaim array entries for root threads that are already dead; returns the
 * number reclaimed */
/* Scan all gtids and unregister every root thread that is no longer running
   and whose root is not active. Returns the sum of the values returned by
   __kmp_unregister_root_other_thread() for those gtids. */
static int __kmp_reclaim_dead_roots(void) {
  int reclaimed = 0;

  for (int gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
    // Only root (uber) threads are candidates.
    if (!KMP_UBER_GTID(gtid))
      continue;
    // Skip roots whose thread is still alive.
    if (__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid])))
      continue;
    // Skip roots that are marked active.
    if (__kmp_root[gtid]->r.r_active)
      continue;
    reclaimed += __kmp_unregister_root_other_thread(gtid);
  }
  return reclaimed;
}
#endif
/* This function attempts to create free entries in __kmp_threads and
__kmp_root, and returns the number of free entries generated.
For Windows* OS static library, the first mechanism used is to reclaim array
entries for root threads that are already dead.
On all platforms, expansion is attempted on the arrays __kmp_threads and
__kmp_root, with appropriate update to __kmp_threads_capacity. Array
capacity is increased by doubling with clipping to __kmp_tp_capacity, if
threadprivate cache array has been created. Synchronization with
__kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
After any dead root reclamation, if the clipping value allows array expansion
to result in the generation of a total of nNeed free slots, the function does
that expansion. If not, nothing is done beyond the possible initial root
thread reclamation.
If any argument is negative, the behavior is undefined. */
/* Try to make room for `nNeed` more entries in __kmp_threads / __kmp_root.

   Returns the number of free entries gained: entries reclaimed from dead
   roots (Windows static library builds only) plus the capacity added by
   growing the arrays. If the arrays cannot be grown by at least nNeed
   without exceeding __kmp_sys_max_nth, no growth is attempted. */
static int __kmp_expand_threads(int nNeed) {
int added = 0;
int minimumRequiredCapacity;
int newCapacity;
kmp_info_t **newThreads;
kmp_root_t **newRoot;
#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
// First try to satisfy the request by reclaiming slots of dead roots.
added = __kmp_reclaim_dead_roots();
if (nNeed) {
nNeed -= added;
if (nNeed < 0)
nNeed = 0;
}
#endif
if (nNeed <= 0)
return added;
KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
// Not enough headroom below the system limit: give up without growing.
if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
return added;
}
minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
// Double the capacity, capped at __kmp_sys_max_nth, until the request fits.
newCapacity = __kmp_threads_capacity;
do {
newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
: __kmp_sys_max_nth;
} while (newCapacity < minimumRequiredCapacity);
// One allocation holds both arrays: the thread pointers first, the root
// pointers right behind them.
newThreads = (kmp_info_t **)__kmp_allocate(
(sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
newRoot =
(kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
KMP_MEMCPY(newThreads, __kmp_threads,
__kmp_threads_capacity * sizeof(kmp_info_t *));
KMP_MEMCPY(newRoot, __kmp_root,
__kmp_threads_capacity * sizeof(kmp_root_t *));
// The old array is not freed here; it is pushed onto
// __kmp_old_threads_list. NOTE(review): presumably because other threads
// may still be reading it, with the list freed at shutdown - confirm.
kmp_old_threads_list_t *node =
(kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
node->threads = __kmp_threads;
node->next = __kmp_old_threads_list;
__kmp_old_threads_list = node;
// Install the new arrays first and the new capacity last, each through a
// volatile store.
*(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
*(kmp_root_t * *volatile *)&__kmp_root = newRoot;
added += newCapacity - __kmp_threads_capacity;
*(volatile int *)&__kmp_threads_capacity = newCapacity;
// Keep the threadprivate capacity in step with the new array size. The
// condition is re-checked under __kmp_tp_cached_lock; an existing
// threadprivate cache is resized, otherwise only the capacity is raised.
if (newCapacity > __kmp_tp_capacity) {
__kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
__kmp_threadprivate_resize_cache(newCapacity);
} else {
*(volatile int *)&__kmp_tp_capacity = newCapacity;
}
__kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
}
return added;
}
/* Register the current thread as a root thread and obtain our gtid. We must
have the __kmp_initz_lock held at this point. The argument is TRUE only if we
are the thread that calls from __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
kmp_info_t *root_thread;
kmp_root_t *root;
int gtid;
int capacity;
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
KA_TRACE(20, ("__kmp_register_root: entered\n"));
KMP_MB();
/* If the initial thread did not invoke the OpenMP RTL yet, and this thread is
not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
does not work as expected -- it may return false (that means there is at
least one empty slot in the __kmp_threads array), but it is possible the only
free slot is #0, which is reserved for the initial thread and so cannot be
used for this one. The following code works around this bug.
However, the right solution seems to be not reserving slot #0 for the initial
thread because:
(1) there is no magic in slot #0,
(2) we cannot detect the initial thread reliably (the first thread which does
serial initialization may not be a real initial thread).
*/
capacity = __kmp_threads_capacity;
if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
--capacity;
}
if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
capacity -= __kmp_hidden_helper_threads_num;
}
if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
if (__kmp_tp_cached) {
__kmp_fatal(KMP_MSG(CantRegisterNewThread),
KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
} else {
__kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
__kmp_msg_null);
}
}
if (TCR_4(__kmp_init_hidden_helper_threads)) {
for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
gtid <= __kmp_hidden_helper_threads_num;
gtid++)
;
KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
"hidden helper thread: T#%d\n",
gtid));
} else {
if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
gtid = 0;
} else {
for (gtid = __kmp_hidden_helper_threads_num + 1;
TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
;
}
KA_TRACE(
1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
KMP_ASSERT(gtid < __kmp_threads_capacity);
}
__kmp_all_nth++;
TCW_4(__kmp_nth, __kmp_nth + 1);
if (__kmp_adjust_gtid_mode) {
if (__kmp_all_nth >= __kmp_tls_gtid_min) {
if (TCR_4(__kmp_gtid_mode) != 2) {
TCW_4(__kmp_gtid_mode, 2);
}
} else {
if (TCR_4(__kmp_gtid_mode) != 1) {
TCW_4(__kmp_gtid_mode, 1);
}
}
}
#ifdef KMP_ADJUST_BLOCKTIME
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
if (__kmp_nth > __kmp_avail_proc) {
__kmp_zero_bt = TRUE;
}
}
#endif
if (!(root = __kmp_root[gtid])) {
root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
KMP_DEBUG_ASSERT(!root->r.r_root_team);
}
#if KMP_STATS_ENABLED
__kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
__kmp_stats_thread_ptr->startLife();
KMP_SET_THREAD_STATE(SERIAL_REGION);
KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
__kmp_initialize_root(root);
if (root->r.r_uber_thread) {
root_thread = root->r.r_uber_thread;
} else {
root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
if (__kmp_storage_map) {
__kmp_print_thread_storage_map(root_thread, gtid);
}
root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
root_thread->th.th_root = root;
if (__kmp_env_consistency_check) {
root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
}
#if USE_FAST_MEMORY
__kmp_initialize_fast_memory(root_thread);
#endif
#if KMP_USE_BGET
KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
__kmp_initialize_bget(root_thread);
#endif
__kmp_init_random(root_thread);
}
if (!root_thread->th.th_serial_team) {
kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
root_thread->th.th_serial_team = __kmp_allocate_team(
root, 1, 1,
#if OMPT_SUPPORT
ompt_data_none,
#endif
proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
}
KMP_ASSERT(root_thread->th.th_serial_team);
KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
root_thread->th.th_serial_team));
TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
root->r.r_root_team->t.t_threads[0] = root_thread;
root->r.r_hot_team->t.t_threads[0] = root_thread;
root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
root_thread->th.th_serial_team->t.t_serialized = 0;
root->r.r_uber_thread = root_thread;
__kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
TCW_4(__kmp_init_gtid, TRUE);
__kmp_gtid_set_specific(gtid);
#if USE_ITT_BUILD
__kmp_itt_thread_name(gtid);
#endif
#ifdef KMP_TDATA_GTID
__kmp_gtid = gtid;
#endif
__kmp_create_worker(gtid, root_thread, __kmp_stksize);
KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
"plain=%u\n",
gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
KMP_INIT_BARRIER_STATE));
{
int b;
for (b = 0; b < bs_last_barrier; ++b) {
root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
}
}
KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
KMP_INIT_BARRIER_STATE);
#if KMP_AFFINITY_SUPPORTED
root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
root_thread->th.th_def_allocator = __kmp_def_allocator;
root_thread->th.th_prev_level = 0;
root_thread->th.th_prev_num_threads = 1;
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
tmp->cg_root = root_thread;
tmp->cg_thread_limit = __kmp_cg_max_nth;
tmp->cg_nthreads = 1;
KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
" cg_nthreads init to 1\n",
root_thread, tmp));
tmp->up = NULL;
root_thread->th.th_cg_roots = tmp;
__kmp_root_counter++;
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
kmp_info_t *root_thread = ompt_get_thread();
ompt_set_thread_state(root_thread, ompt_state_overhead);
if (ompt_enabled.ompt_callback_thread_begin) {
ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
ompt_thread_initial, __ompt_get_thread_data_internal());
}
ompt_data_t *task_data;
ompt_data_t *parallel_data;
__ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data,
NULL);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
}
ompt_set_thread_state(root_thread, ompt_state_work_serial);
}
#endif
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
ompd_bp_thread_begin();
#endif
KMP_MB();
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
return gtid;
}
#if KMP_NESTED_HOT_TEAMS
// Releases the nested hot team cached by `thr` at nesting depth `level`,
// first recursing into the deeper levels owned by each member of that team.
// Returns the sum, over every team released, of that team's recorded thread
// count minus one; returns 0 when nothing is cached at `level`.
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (hot_teams == NULL || hot_teams[level].hot_team == NULL)
    return 0; // no hot team stored for this depth
  KMP_DEBUG_ASSERT(level < max_level);

  kmp_team_t *team = hot_teams[level].hot_team;
  const int team_size = hot_teams[level].hot_team_nth;
  int freed = team_size - 1; // thread 0 of the team is not counted

  if (level < max_level - 1) {
    for (int tid = 0; tid < team_size; ++tid) {
      kmp_info_t *member = team->t.t_threads[tid];
      freed += __kmp_free_hot_teams(root, member, level + 1, max_level);
      // Only members past index 0 have their hot-team table released here.
      if (tid == 0 || member->th.th_hot_teams == NULL)
        continue;
      __kmp_free(member->th.th_hot_teams);
      member->th.th_hot_teams = NULL;
    }
  }
  __kmp_free_team(root, team, NULL);
  return freed;
}
#endif
// Tears down everything hanging off `root`: frees its root team and hot team
// (plus nested hot teams when KMP_NESTED_HOT_TEAMS is enabled), issues the
// OMPD/OMPT end-of-thread notifications, drops the uber thread from its
// contention group and finally reaps the uber thread.
//
// gtid: global thread id of the root (not used by the body itself).
// root: root structure to reset; r_root_team, r_hot_team and r_uber_thread
//       are cleared on return and r_begin is set to FALSE.
// Returns the hot team's t_nproc plus whatever __kmp_free_hot_teams() reports
// for nested hot teams.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;
  KMP_DEBUG_ASSERT(!root->r.r_active);
  // Detach both teams from the root before they are handed to
  // __kmp_free_team().
  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level > 0) {
    // Free nested hot teams (if any) and each member's hot-team table.
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
  // When tasking is in use, wait for task teams to be unreferenced before the
  // thread is reaped below.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }
#if KMP_OS_WINDOWS
  // Release the OS handle stored in the uber thread's descriptor.
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif
#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  // Fetch the task/parallel data handles that the implicit-task end callback
  // reports.
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif
  // Only __kmp_nth is decremented here; __kmp_all_nth is not touched by this
  // function.
  TCW_4(__kmp_nth, __kmp_nth - 1);
  // `i` holds the contention-group thread count from before the decrement.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // The uber thread was the last member: free the contention group node.
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);
  root->r.r_uber_thread = NULL;
  // Mark the root as no longer begun.
  root->r.r_begin = FALSE;
  return n;
}
void __kmp_unregister_root_current_thread(int gtid) {
KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
called during an abort, only during a normal close. furthermore, if you
have the forkjoin lock, you should never try to get the initz lock */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
"exiting T#%d\n",
gtid));
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
return;
}
kmp_root_t *root = __kmp_root[gtid];
KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
KMP_ASSERT(KMP_UBER_GTID(gtid));
KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
KMP_ASSERT(root->r.r_active == FALSE);
KMP_MB();
kmp_info_t *thread = __kmp_threads[gtid];
kmp_team_t *team = thread->th.th_team;
kmp_task_team_t *task_team = thread->th.th_task_team;
if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
__kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
}
__kmp_reset_root(gtid, root);
KMP_MB();
KC_TRACE(10,
("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
#if KMP_OS_WINDOWS
/* Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result. */
// Resets the root registered under `gtid` and hands back the value returned
// by __kmp_reset_root(). Unlike __kmp_unregister_root_current_thread(), this
// routine takes no lock itself.
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];
  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));

  // Sanity checks: gtid must name a live, inactive uber thread owning `root`.
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  const int freed = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return freed;
}
#endif
#if KMP_DEBUG
void __kmp_task_info() {
kmp_int32 gtid = __kmp_entry_gtid();
kmp_int32 tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *steam = this_thr->th.th_serial_team;
kmp_team_t *team = this_thr->th.th_team;
__kmp_printf(
"__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
"ptask=%p\n",
gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif
/* TODO: optimize this routine; take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible */
// Binds thread descriptor `this_thr` to `team` as team-local thread `tid`.
//
// this_thr: descriptor to (re)initialize; must already have a serial team.
// team:     team being joined; t_threads[0] (the team's primary thread) must
//           already be set, as must t_dispatch and
//           t_implicit_task_taskdata.
// tid:      index of this thread inside `team`.
// gtid:     global thread id, used here for tracing and storage-map output.
//
// Besides copying team-derived fields, the routine initializes the thread's
// implicit task, lazily allocates its threadprivate table, moves the thread
// into the primary thread's contention group, and prepares its dispatch
// buffers.
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is not set here; it is set up in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);
  KMP_MB();
  TCW_SYNC_PTR(this_thr->th.th_team, team);
  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  // With tasking enabled the thread starts out as not safe to reap.
  if (__kmp_tasking_mode != tskm_immediate_exec)
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;
#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  // Fields mirrored from the team and its primary thread.
  this_thr->th.th_root = master->th.th_root;
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;
  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);
  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
  this_thr->th.th_local.this_construct = 0;
  // Allocate the th_pri_common table on first use only.
  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }
  // A non-primary thread whose contention group differs from the primary's
  // leaves its old group (freeing the node if it was the last member) and
  // joins the primary thread's group.
  if (this_thr != master &&
      this_thr->th.th_cg_roots != master->th.th_cg_roots) {
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // `i` is the member count from before the decrement.
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp);
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    // Take the thread-limit ICV from the new contention group.
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }
  {
    // Dispatch state: one private buffer for a team whose t_max_nproc is 1,
    // __kmp_dispatch_num_buffers otherwise.
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      // An existing buffer is reused and cleared.
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }
    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;
    dispatch->th_deo_fcn = 0;
    dispatch->th_dxo_fcn = 0;
  }
  this_thr->th.th_next_pool = NULL;
  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
  KMP_MB();
}
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We will first try to get an available
   thread from the thread pool. If none is available, we will fork a new one,
   assuming we are able to create a new one. This should be assured, as the
   caller should check on this first. */
// Provides a worker thread for slot `new_tid` of `team`.
//
// root:    root the new thread's serial team is allocated under.
// team:    team the thread is being added to.
// new_tid: team-local index the thread will occupy.
//
// A thread is taken from __kmp_thread_pool when the pool is non-empty and
// `team` is not the hidden helper team. Otherwise a free __kmp_threads slot
// is located, a fresh kmp_info_t plus serial team is allocated and
// initialized, and __kmp_create_worker() is called for it.
// Returns the thread descriptor that was reused or created.
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;
  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();
  /* first, try to get one from the thread pool unless allocating thread is
   * the main hidden helper thread. The hidden helper team should always
   * allocate new OS threads. */
  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
    // Pop the head of the pool list.
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    // Under the thread's suspend mutex, drop it from the active-in-pool count.
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);
    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
    // The pooled thread keeps the gtid it already has.
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
    TCW_4(__kmp_nth, __kmp_nth + 1);
    new_thr->th.th_task_state = 0;
    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
    }
#ifdef KMP_ADJUST_BLOCKTIME
    // Unless blocktime was set explicitly, force zero blocktime once there
    // are more threads than available processors.
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif
#if KMP_DEBUG
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif
    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
    KMP_MB();
    return new_thr;
  }
  // Pool not usable: create a brand new thread.
  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
#if KMP_USE_MONITOR
  // Start the monitor thread once; the flag is re-checked under
  // __kmp_monitor_lock.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // Spin until __kmp_init_monitor reaches 2.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif
  KMP_MB();
  {
    // Find the first empty __kmp_threads slot. While hidden helper threads
    // are being initialized the search starts at 1; otherwise it starts just
    // past the slots set aside for them.
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;
    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }
    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }
  // Allocate the descriptor and publish it in the global table.
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
  new_thr->th.th_nt_strict = false;
  new_thr->th.th_nt_loc = NULL;
  new_thr->th.th_nt_sev = severity_fatal;
  new_thr->th.th_nt_msg = NULL;
  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // Mark several fields as suppressed for ITT threading-error reports.
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }
  {
    // Give the thread its own single-thread serial team.
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none,
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0;
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif
#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif
  __kmp_init_random(new_thr);
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
  // Reset the per-barrier state of the new thread.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }
  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
  new_thr->th.th_sleep_loc_type = flag_unset;
  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif
#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;
  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);
  new_thr->th.th_set_nested_nth = NULL;
  new_thr->th.th_set_nested_nth_sz = 0;
  // Account for the new thread in both global counters.
  __kmp_all_nth++;
  __kmp_nth++;
  // Choose gtid mode 2 once the total thread count reaches
  // __kmp_tls_gtid_min, mode 1 below that.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }
#ifdef KMP_ADJUST_BLOCKTIME
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif
#if KMP_AFFINITY_SUPPORTED
  __kmp_affinity_set_init_mask(new_gtid, FALSE);
#endif
  // Create the worker for the descriptor prepared above.
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}
/* Reinitialize team for reuse.
   The hot team code calls this case at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
// Refreshes the parts of `team` that change between uses: its source
// location, its team id, and the implicit task plus ICVs of thread 0.
//
// team:     team to refresh; t_threads[0] is read.
// new_icvs: ICV set copied into thread 0's implicit task data.
// loc:      source location stored in t_ident and passed to the implicit
//           task initialization.
static void __kmp_reinitialize_team(kmp_team_t *team,
                                    kmp_internal_control_t *new_icvs,
                                    ident_t *loc) {
  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
  KMP_DEBUG_ASSERT(team && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
  // KMP_CHECK_UPDATE presumably writes only when the value differs, which
  // would avoid needless writes to the team struct -- confirm against the
  // macro definition.
  KMP_CHECK_UPDATE(team->t.t_ident, loc);
  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
  // Set up thread 0's implicit task, then copy the ICVs into it.
  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
}
/* Initialize the team data structure.
   This assumes the t_threads and t_max_nproc are already set.
   Also, we don't touch the arguments */
// Puts `team` into its initial state for `new_nproc` threads.
//
// team:      team to initialize; t_threads and t_max_nproc must already be
//            valid.
// new_nproc: number of threads in the team; must not exceed t_max_nproc.
// new_icvs:  ICVs supplying the schedule and, through
//            __kmp_reinitialize_team(), thread 0's implicit-task ICVs.
// loc:       source location forwarded to __kmp_reinitialize_team().
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();
  team->t.t_master_tid = 0;
  // A one-thread team is marked serialized.
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */
  TCW_SYNC_PTR(team->t.t_pkfn, NULL);
  team->t.t_invoke = NULL;
  team->t.t_sched.sched = new_icvs->sched.sched;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // No floating-point control state has been saved yet.
  team->t.t_fp_control_saved = FALSE;
  team->t.t_x87_fpu_control_word = 0;
  team->t.t_mxcsr = 0;
#endif
  team->t.t_construct = 0;
  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;
#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL;
#endif
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0;
#endif
  team->t.t_control_stack_top = NULL;
  // Ident, team id and thread 0's implicit task are handled by the shared
  // reinitialization routine.
  __kmp_reinitialize_team(team, new_icvs, loc);
  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
#if KMP_AFFINITY_SUPPORTED
// Records the place partition [first, last] and the target place `newp` on
// thread `th`. When `newp` differs from the thread's current place, the
// team's display-affinity flag is raised (if affinity display is enabled)
// and the thread's topology ids/attrs are refreshed from the new place.
static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
                                          int first, int last, int newp) {
  th->th.th_first_place = first;
  th->th.th_last_place = last;
  th->th.th_new_place = newp;

  // Nothing more to update when the thread stays where it is.
  if (newp == th->th.th_current_place)
    return;

  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
    team->t.t_display_affinity = 1;
  // Copy the topology information recorded for the new place.
  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
}
// Returns the place that follows `place` when walking the partition
// [first_place, last_place]: wraps from last_place back to first_place, and
// from the highest mask index (num_masks - 1) back to 0.
static inline int __kmp_partition_next_place(int place, int first_place,
                                             int last_place, int num_masks) {
  if (place == last_place)
    return first_place;
  if (place == (num_masks - 1))
    return 0;
  return place + 1;
}

// Assigns a place partition and a target place to the threads of `team`
// according to the team's proc_bind policy, starting from the partition and
// current place of the team's thread 0.
//
// team:               team whose threads are placed; nothing is done for the
//                     hidden helper team.
// update_master_only: when 1, the spread policy processes thread 0 only.
//
// Policies:
//  - default: nothing to do (team is expected to have one thread).
//  - primary: every other thread gets thread 0's partition and place.
//  - close:   threads are laid out on consecutive places after thread 0's,
//             several per place when threads outnumber places.
//  - spread:  the partition is subdivided so that threads are spaced out;
//             each thread receives its own sub-partition.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  if (KMP_HIDDEN_HELPER_TEAM(team))
    return;
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  int num_masks = __kmp_affinity.num_masks;
  // The team inherits thread 0's partition.
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;
  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));
  switch (proc_bind) {
  case proc_bind_default:
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;
  case proc_bind_primary: {
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;
  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // Partition size; the partition may wrap around the end of the mask list.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // At most one thread per place: threads 1.. take the places following
      // thread 0's place.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);
        place = __kmp_partition_next_place(place, first_place, last_place,
                                           num_masks);
        __kmp_set_thread_place(team, th, first_place, last_place, place);
        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: S threads per place, with `rem` places
      // taking one extra thread, spaced `gap` places apart.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);
        __kmp_set_thread_place(team, th, first_place, last_place, place);
        s_count++;
        if ((s_count == S) && rem && (gap_ct == gap)) {
          // Stay on this place: it takes one extra thread next iteration.
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // The extra thread has been placed; move on and restart the gap.
          place = __kmp_partition_next_place(place, first_place, last_place,
                                             num_masks);
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) {
          // This place is full; move on.
          place = __kmp_partition_next_place(place, first_place, last_place,
                                             num_masks);
          gap_ct++;
          s_count = 0;
        }
        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;
  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;
      if (n_places != num_masks) {
        // The partition is a proper subset of all places: give each thread a
        // sub-partition of S places (one more for `rem` of them) and bind it
        // to the first place of that sub-partition.
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;
        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);
          int fplace = place, nplace = place;
          s_count = 1;
          // Advance to the last place of this thread's sub-partition.
          while (s_count < S) {
            place = __kmp_partition_next_place(place, first_place, last_place,
                                               num_masks);
            s_count++;
          }
          if (rem && (gap_ct == gap)) {
            // This sub-partition absorbs one of the leftover places.
            place = __kmp_partition_next_place(place, first_place, last_place,
                                               num_masks);
            rem--;
            gap_ct = 0;
          }
          __kmp_set_thread_place(team, th, fplace, place, nplace);
          gap_ct++;
          // Step to the first place of the next sub-partition.
          place = __kmp_partition_next_place(place, first_place, last_place,
                                             num_masks);
          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;
        // One extra iteration (f == n_th) is run; it only updates `place`,
        // which the assertion after the loop checks.
        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          if (first >= n_places) {
            // Ran past the end: wrap the sub-partition around.
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            __kmp_set_thread_place(team, th, first, last, place);
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S/rem/gap distribution as the
      // oversubscribed "close" case, but each thread's partition is the single
      // place it is bound to.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);
        __kmp_set_thread_place(team, th, place, place, place);
        s_count++;
        if ((s_count == S) && rem && (gap_ct == gap)) {
          // Stay on this place: it takes one extra thread next iteration.
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // The extra thread has been placed; move on and restart the gap.
          place = __kmp_partition_next_place(place, first_place, last_place,
                                             num_masks);
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) {
          // This place is full; move on.
          place = __kmp_partition_next_place(place, first_place, last_place,
                                             num_masks);
          gap_ct++;
          s_count = 0;
        }
        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;
  default:
    break;
  }
  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}
#endif
/* Allocate a new team data structure to use. Take one off of the free pool if
   available */
kmp_team_t *
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data,
#endif
kmp_proc_bind_t new_proc_bind,
kmp_internal_control_t *new_icvs,
int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
int f;
kmp_team_t *team;
int use_hot_team = !root->r.r_active;
int level = 0;
int do_place_partition = 1;
KA_TRACE(20, ("__kmp_allocate_team: called\n"));
KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
KMP_MB();
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t *hot_teams;
if (master) {
team = master->th.th_team;
level = team->t.t_active_level;
if (master->th.th_teams_microtask) {
if (master->th.th_teams_size.nteams > 1 &&
(
team->t.t_pkfn ==
(microtask_t)__kmp_teams_master ||
master->th.th_teams_level <
team->t.t_level)) {
++level;
}
if ((master->th.th_teams_size.nteams == 1 &&
master->th.th_teams_level >= team->t.t_level) ||
(team->t.t_pkfn == (microtask_t)__kmp_teams_master))
do_place_partition = 0;
}
hot_teams = master->th.th_hot_teams;
if (level < __kmp_hot_teams_max_level && hot_teams &&
hot_teams[level].hot_team) {
use_hot_team = 1;
} else {
use_hot_team = 0;
}
} else {
KMP_DEBUG_ASSERT(new_nproc == 1);
}
#endif
if (use_hot_team && new_nproc > 1) {
KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
#if KMP_NESTED_HOT_TEAMS
team = hot_teams[level].hot_team;
#else
team = root->r.r_hot_team;
#endif
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec) {
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
"task_team[1] = %p before reinit\n",
team->t.t_task_team[0], team->t.t_task_team[1]));
}
#endif
if (team->t.t_nproc != new_nproc &&
__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
int old_nthr = team->t.t_nproc;
__kmp_resize_dist_barrier(team, old_nthr, new_nproc);
}
if (do_place_partition == 0)
team->t.t_proc_bind = proc_bind_default;
unchanged, and put that case first. */
if (team->t.t_nproc == new_nproc) {
KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
if (team->t.t_size_changed == -1) {
team->t.t_size_changed = 1;
} else {
KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
}
kmp_r_sched_t new_sched = new_icvs->sched;
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
__kmp_reinitialize_team(team, new_icvs,
root->r.r_uber_thread->th.th_ident);
KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
team->t.t_threads[0], team));
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
#if KMP_AFFINITY_SUPPORTED
if ((team->t.t_size_changed == 0) &&
(team->t.t_proc_bind == new_proc_bind)) {
if (new_proc_bind == proc_bind_spread) {
if (do_place_partition) {
__kmp_partition_places(team, 1);
}
}
KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
"proc_bind = %d, partition = [%d,%d]\n",
team->t.t_id, new_proc_bind, team->t.t_first_place,
team->t.t_last_place));
} else {
if (do_place_partition) {
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
__kmp_partition_places(team);
}
}
#else
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#endif
} else if (team->t.t_nproc > new_nproc) {
KA_TRACE(20,
("__kmp_allocate_team: decreasing hot team thread count to %d\n",
new_nproc));
team->t.t_size_changed = 1;
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
__kmp_add_threads_to_team(team, new_nproc);
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
for (f = new_nproc; f < team->t.t_nproc; f++) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th);
th->th.th_task_team = NULL;
}
}
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
hot_teams[level].hot_team_nth = new_nproc;
#endif
for (f = new_nproc; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
__kmp_free_thread(team->t.t_threads[f]);
team->t.t_threads[f] = NULL;
}
#if KMP_NESTED_HOT_TEAMS
}
else {
for (f = new_nproc; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
for (int b = 0; b < bs_last_barrier; ++b) {
if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
}
KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
}
}
}
#endif
team->t.t_nproc = new_nproc;
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
__kmp_reinitialize_team(team, new_icvs,
root->r.r_uber_thread->th.th_ident);
for (f = 0; f < new_nproc; ++f) {
team->t.t_threads[f]->th.th_team_nproc = new_nproc;
}
KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
team->t.t_threads[0], team));
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
#ifdef KMP_DEBUG
for (f = 0; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
team->t.t_threads[f]->th.th_team_nproc ==
team->t.t_nproc);
}
#endif
if (do_place_partition) {
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
__kmp_partition_places(team);
#endif
}
} else {
KA_TRACE(20,
("__kmp_allocate_team: increasing hot team thread count to %d\n",
new_nproc));
int old_nproc = team->t.t_nproc;
team->t.t_size_changed = 1;
#if KMP_NESTED_HOT_TEAMS
int avail_threads = hot_teams[level].hot_team_nth;
if (new_nproc < avail_threads)
avail_threads = new_nproc;
kmp_info_t **other_threads = team->t.t_threads;
for (f = team->t.t_nproc; f < avail_threads; ++f) {
int b;
kmp_balign_t *balign = other_threads[f]->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
if (hot_teams[level].hot_team_nth >= new_nproc) {
KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
team->t.t_nproc = new_nproc;
} else {
team->t.t_nproc = hot_teams[level].hot_team_nth;
hot_teams[level].hot_team_nth = new_nproc;
#endif
if (team->t.t_max_nproc < new_nproc) {
__kmp_reallocate_team_arrays(team, new_nproc);
__kmp_reinitialize_team(team, new_icvs, NULL);
}
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
KMP_AFFINITY_SUPPORTED
workers. The reason is that workers inherit the affinity from the
primary thread, so if a lot of workers are created on the single
core quickly, they don't get a chance to set their own affinity for
a long time. */
kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
#endif
for (f = team->t.t_nproc; f < new_nproc; f++) {
kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
KMP_DEBUG_ASSERT(new_worker);
team->t.t_threads[f] = new_worker;
KA_TRACE(20,
("__kmp_allocate_team: team %d init T#%d arrived: "
"join=%llu, plain=%llu\n",
team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
team->t.t_bar[bs_plain_barrier].b_arrived));
{
int b;
kmp_balign_t *balign = new_worker->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
}
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
KMP_AFFINITY_SUPPORTED
new_temp_affinity.restore();
#endif
#if KMP_NESTED_HOT_TEAMS
}
#endif
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
__kmp_add_threads_to_team(team, new_nproc);
}
__kmp_initialize_team(team, new_nproc, new_icvs,
root->r.r_uber_thread->th.th_ident);
KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
for (f = 0; f < team->t.t_nproc; ++f)
__kmp_initialize_info(team->t.t_threads[f], team, f,
__kmp_gtid_from_tid(f, team));
kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
for (f = old_nproc; f < team->t.t_nproc; ++f)
team->t.t_threads[f]->th.th_task_state = old_state;
#ifdef KMP_DEBUG
for (f = 0; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
team->t.t_threads[f]->th.th_team_nproc ==
team->t.t_nproc);
}
#endif
if (do_place_partition) {
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
__kmp_partition_places(team);
#endif
}
}
if (master->th.th_teams_microtask) {
for (f = 1; f < new_nproc; ++f) {
kmp_info_t *thr = team->t.t_threads[f];
thr->th.th_teams_microtask = master->th.th_teams_microtask;
thr->th.th_teams_level = master->th.th_teams_level;
thr->th.th_teams_size = master->th.th_teams_size;
}
}
#if KMP_NESTED_HOT_TEAMS
if (level) {
for (f = 1; f < new_nproc; ++f) {
kmp_info_t *thr = team->t.t_threads[f];
int b;
kmp_balign_t *balign = thr->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
}
#endif
__kmp_alloc_argv_entries(argc, team, TRUE);
KMP_CHECK_UPDATE(team->t.t_argc, argc);
KF_TRACE(10, (" hot_team = %p\n", team));
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec) {
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
"task_team[1] = %p after reinit\n",
team->t.t_task_team[0], team->t.t_task_team[1]));
}
#endif
#if OMPT_SUPPORT
__ompt_team_assign_id(team, ompt_parallel_data);
#endif
KMP_MB();
return team;
}
KMP_MB();
for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
that we have a resizing mechanism */
if (team->t.t_max_nproc >= max_nproc) {
__kmp_team_pool = team->t.t_next_pool;
if (max_nproc > 1 &&
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
if (!team->t.b) {
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
}
}
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
"task_team[1] %p to NULL\n",
&team->t.t_task_team[0], &team->t.t_task_team[1]));
team->t.t_task_team[0] = NULL;
team->t.t_task_team[1] = NULL;
__kmp_alloc_argv_entries(argc, team, TRUE);
KMP_CHECK_UPDATE(team->t.t_argc, argc);
KA_TRACE(
20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
{
int b;
for (b = 0; b < bs_last_barrier; ++b) {
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
team->t.t_bar[b].b_master_arrived = 0;
team->t.t_bar[b].b_team_arrived = 0;
#endif
}
}
team->t.t_proc_bind = new_proc_bind;
KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
team->t.t_id));
#if OMPT_SUPPORT
__ompt_team_assign_id(team, ompt_parallel_data);
#endif
team->t.t_nested_nth = NULL;
KMP_MB();
return team;
}
team = __kmp_reap_team(team);
__kmp_team_pool = team;
}
KMP_MB();
team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
team->t.t_max_nproc = max_nproc;
if (max_nproc > 1 &&
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
}
seems to really hurt performance a lot on the P4, so, let's not use this */
__kmp_allocate_team_arrays(team, max_nproc);
KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
"%p to NULL\n",
&team->t.t_task_team[0], &team->t.t_task_team[1]));
team->t.t_task_team[0] = NULL;
team->t.t_task_team[1] = NULL;
if (__kmp_storage_map) {
__kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
}
__kmp_alloc_argv_entries(argc, team, FALSE);
team->t.t_argc = argc;
KA_TRACE(20,
("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
{
int b;
for (b = 0; b < bs_last_barrier; ++b) {
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
team->t.t_bar[b].b_master_arrived = 0;
team->t.t_bar[b].b_team_arrived = 0;
#endif
}
}
team->t.t_proc_bind = new_proc_bind;
#if OMPT_SUPPORT
__ompt_team_assign_id(team, ompt_parallel_data);
team->t.ompt_serialized_team_info = NULL;
#endif
KMP_MB();
team->t.t_nested_nth = NULL;
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
team->t.t_id));
return team;
}
* associated with it */
void __kmp_free_team(kmp_root_t *root,
kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
int f;
KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
team->t.t_id));
KMP_DEBUG_ASSERT(root);
KMP_DEBUG_ASSERT(team);
KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
KMP_DEBUG_ASSERT(team->t.t_threads);
int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
int level;
if (master) {
level = team->t.t_active_level - 1;
if (master->th.th_teams_microtask) {
if (master->th.th_teams_size.nteams > 1) {
++level;
}
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
master->th.th_teams_level == team->t.t_level) {
++level;
}
}
#if KMP_DEBUG
kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
#endif
if (level < __kmp_hot_teams_max_level) {
KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
use_hot_team = 1;
}
}
#endif
TCW_SYNC_PTR(team->t.t_pkfn,
NULL);
#if KMP_OS_WINDOWS
team->t.t_copyin_counter = 0;
#endif
if (!use_hot_team) {
if (__kmp_tasking_mode != tskm_immediate_exec) {
for (f = 1; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
kmp_info_t *th = team->t.t_threads[f];
volatile kmp_uint32 *state = &th->th.th_reap_state;
while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
DWORD ecode;
if (!__kmp_is_thread_alive(th, &ecode)) {
*state = KMP_SAFE_TO_REAP;
break;
}
#endif
if (th->th.th_sleep_loc)
__kmp_null_resume_wrapper(th);
KMP_CPU_PAUSE();
}
}
int tt_idx;
for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
if (task_team != NULL) {
for (f = 0; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
team->t.t_threads[f]->th.th_task_team = NULL;
}
KA_TRACE(
20,
("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
__kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
__kmp_free_task_team(master, task_team);
#endif
team->t.t_task_team[tt_idx] = NULL;
}
}
}
if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
KMP_INTERNAL_FREE(team->t.t_nested_nth);
}
team->t.t_nested_nth = NULL;
team->t.t_parent = NULL;
team->t.t_level = 0;
team->t.t_active_level = 0;
for (f = 1; f < team->t.t_nproc; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
1, 2);
}
__kmp_free_thread(team->t.t_threads[f]);
}
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
if (team->t.b) {
team->t.b->go_release();
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
for (f = 1; f < team->t.t_nproc; ++f) {
if (team->t.b->sleep[f].sleep) {
__kmp_atomic_resume_64(
team->t.t_threads[f]->th.th_info.ds.ds_gtid,
(kmp_atomic_flag_64<> *)NULL);
}
}
}
for (int f = 1; f < team->t.t_nproc; ++f) {
while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
KMP_CPU_PAUSE();
}
}
}
for (f = 1; f < team->t.t_nproc; ++f) {
team->t.t_threads[f] = NULL;
}
if (team->t.t_max_nproc > 1 &&
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
distributedBarrier::deallocate(team->t.b);
team->t.b = NULL;
}
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
__kmp_team_pool = (volatile kmp_team_t *)team;
} else {
KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
team->t.t_threads[1]->th.th_cg_roots);
if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
for (f = 1; f < team->t.t_nproc; ++f) {
kmp_info_t *thr = team->t.t_threads[f];
KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
thr->th.th_cg_roots->cg_root == thr);
kmp_cg_root_t *tmp = thr->th.th_cg_roots;
thr->th.th_cg_roots = tmp->up;
KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
" up to node %p. cg_nthreads was %d\n",
thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
int i = tmp->cg_nthreads--;
if (i == 1) {
__kmp_free(tmp);
}
if (thr->th.th_cg_roots)
thr->th.th_current_task->td_icvs.thread_limit =
thr->th.th_cg_roots->cg_thread_limit;
}
}
}
KMP_MB();
}
/* Release all memory owned by a team and the team structure itself.

   team  team to destroy

   Returns the value team->t.t_next_pool held before the team was freed, so a
   caller walking the pool can continue with it. */
kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
  /* validate before the first dereference */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_argv);

  // Must be read before the team is freed below.
  kmp_team_t *next_pool = team->t.t_next_pool;

  __kmp_free_team_arrays(team);
  // t_argv is freed separately only when it is not the inline buffer.
  if (team->t.t_argv != &team->t.t_inline_argv[0])
    __kmp_free((void *)team->t.t_argv);
  __kmp_free(team);

  KMP_MB();
  return next_pool;
}
/* Detach a worker from its team and put it on the global thread pool.

   Barrier flags, team/root/dispatch pointers, contention-group nodes and the
   implicit task are reset, then the thread is linked into __kmp_thread_pool,
   which is kept ordered by ascending gtid, and __kmp_nth is decremented.

   this_th  thread to return to the pool */
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  // Validate before the trace below dereferences this_th.
  KMP_DEBUG_ASSERT(this_th);

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  // Move the thread off the parent flag and clear its per-barrier team data.
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL);

  // Leave the contention groups: nodes rooted at this thread are popped and
  // freed one by one; the first node rooted elsewhere ends the walk.
  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // this thread is the cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // node rooted at another thread
      if (tmp->cg_nthreads == 0) { // last thread left the group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when hot team is disabled but can occur even when
   * the hot team is enabled */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the cached insert point is already past this gtid, drop it so the scan
  // starts from the head of the list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself; advance it to the first entry whose gtid is not
  // smaller than ours.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Link the thread in and remember it as the new insert point.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_suspend_initialize_thread(this_th);
  __kmp_lock_suspend_mx(this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(this_th);

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  // Clear __kmp_zero_bt once the thread count no longer exceeds
  // __kmp_avail_proc, unless blocktime was set through the environment.
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif

  KMP_MB();
}
/* Main routine of a worker thread.

   The thread repeatedly waits at the fork barrier, runs the team's microtask
   through t_invoke when a team and a t_pkfn are set, and then joins, until
   __kmp_global.g.g_done becomes non-zero.

   this_thr  descriptor of the calling worker

   Returns this_thr. */
void *__kmp_launch_thread(kmp_info_t *this_thr) {
#if OMP_PROFILING_SUPPORT
  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
  if (ProfileTraceFile)
    llvm::timeTraceProfilerInitialize(/*TimeTraceGranularity=*/500,
                                      "libomptarget");
#endif

  int gtid = this_thr->th.th_info.ds.ds_gtid;
  kmp_team_t **volatile team_slot;

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid);
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

#if OMPT_SUPPORT
  ompt_data_t *thread_data = nullptr;
  if (ompt_enabled.enabled) {
    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
    *thread_data = ompt_data_none;

    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    this_thr->th.ompt_thread_info.wait_id = 0;
    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
    this_thr->th.ompt_thread_info.parallel_flags = 0;
    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_worker, thread_data);
    }
    this_thr->th.ompt_thread_info.state = ompt_state_idle;
  }
#endif

  /* This is where the thread waits for work. */
  while (!TCR_4(__kmp_global.g.g_done)) {
    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
    KMP_MB();

    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));

    /* KMP_GTID_DNE is passed as the tid */
    __kmp_fork_barrier(gtid, KMP_GTID_DNE);

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    team_slot = &this_thr->th.th_team;

    /* No team assigned, or shutdown requested: go back to the loop test. */
    if (!TCR_SYNC_PTR(*team_slot) || TCR_4(__kmp_global.g.g_done))
      continue;

    /* Run the microtask, if one is set. */
    if (TCR_SYNC_PTR((*team_slot)->t.t_pkfn) != NULL) {
      KA_TRACE(20,
               ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
                gtid, (*team_slot)->t.t_id, __kmp_tid_from_gtid(gtid),
                (*team_slot)->t.t_pkfn));

      updateHWFPControl(*team_slot);

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
      }
#endif

      int invoke_status = (*team_slot)->t.t_invoke(gtid);
      KMP_ASSERT(invoke_status);

      KMP_MB();
      KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                    gtid, (*team_slot)->t.t_id, __kmp_tid_from_gtid(gtid),
                    (*team_slot)->t.t_pkfn));
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      /* clear the exit frame before joining */
      __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;

      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    /* join barrier after the parallel region */
    __kmp_join_barrier(gtid);
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
  }
#endif

  this_thr->th.th_task_team = NULL;
  __kmp_common_destroy_gtid(gtid);

  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
  KMP_MB();

#if OMP_PROFILING_SUPPORT
  llvm::timeTraceProfilerFinishThread();
#endif
  return this_thr;
}
void __kmp_internal_end_dest(void *specific_gtid) {
int gtid;
__kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id);
KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
* this is because 0 is reserved for the nothing-stored case */
__kmp_internal_end_thread(gtid);
}
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
// Compiled only for the dynamic library on Unix. Marked with the compiler's
// destructor attribute; it simply forwards to __kmp_internal_end_atexit().
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
__kmp_internal_end_atexit();
}
#endif
than one thread alive */
void __kmp_internal_end_atexit(void) {
KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
josh: ideally, we want to completely shutdown the library in this atexit
handler, but stat code that depends on thread specific data for gtid fails
because that data becomes unavailable at some point during the shutdown, so
we call __kmp_internal_end_thread instead. We should eventually remove the
dependency on __kmp_get_specific_gtid in the stat code and use
__kmp_internal_end_library to cleanly shutdown the library.
// TODO: Can some of this comment about GVS be removed?
I suspect that the offending stat code is executed when the calling thread
tries to clean up a dead root thread's data structures, resulting in GVS
code trying to close the GVS structures for that thread, but since the stat
code uses __kmp_get_specific_gtid to get the gtid with the assumption that
the calling thread is cleaning up itself instead of another thread, it get
confused. This happens because allowing a thread to unregister and cleanup
another thread is a recent modification for addressing an issue.
Based on the current design (20050722), a thread may end up
trying to unregister another thread only if thread death does not trigger
the calling of __kmp_internal_end_thread. For Linux* OS, there is the
thread specific data destructor function to detect thread death. For
Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
is nothing. Thus, the workaround is applicable only for Windows static
stat library. */
__kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
__kmp_close_console();
#endif
}
/* Destroy a thread descriptor and everything it owns.

   thread   descriptor to destroy (must be __kmp_threads[its gtid])
   is_root  non-zero for a root thread: the release-from-barrier and
            __kmp_reap_worker() steps are skipped

   Clears the thread's __kmp_threads[] slot, decrements __kmp_all_nth, frees
   the per-thread resources, reaps the serial team and frees the descriptor.
   NOTE(review): presumably requires __kmp_forkjoin_lock to be held by the
   caller -- confirm. */
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Release the worker from the fork barrier so it can be reaped. */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        // Spin until th_used_in_team can be switched 0 -> 3, then resume.
        while (
            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
          KMP_CPU_PAUSE();
        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
      } else {
        /* Need release fence here to prevent seg faults for tree forkjoin
           barrier (GEH) */
        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                           thread);
        __kmp_release_64(&flag);
      }
    }

    // Terminate the OS thread.
    __kmp_reap_worker(thread);

    // Keep the pool's active-thread count in step if the thread was still
    // counted as active in the pool.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
    }
  }

  __kmp_free_implicit_task(thread);

#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif

  __kmp_suspend_uninitialize_thread(thread);

  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  // Only __kmp_all_nth is decremented in this function.
  --__kmp_all_nth;

#ifdef KMP_ADJUST_BLOCKTIME
  // Clear __kmp_zero_bt once the thread count no longer exceeds
  // __kmp_avail_proc, unless blocktime was set through the environment.
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }
#endif

#if KMP_USE_HIER_SCHED
  if (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();
}
/* Free every entry of the ITT region-domain and barrier-domain hash tables
   through __kmp_thread_free() on behalf of `th`. Does nothing unless
   USE_ITT_NOTIFY is enabled. */
static void __kmp_itthash_clean(kmp_info_t *th) {
#if USE_ITT_NOTIFY
  // Region domains: walk each bucket chain, saving the successor before the
  // current entry is released.
  if (__kmp_itt_region_domains.count > 0) {
    for (int slot = 0; slot < KMP_MAX_FRAME_DOMAINS; ++slot) {
      kmp_itthash_entry_t *entry = __kmp_itt_region_domains.buckets[slot];
      for (kmp_itthash_entry_t *following; entry; entry = following) {
        following = entry->next_in_bucket;
        __kmp_thread_free(th, entry);
      }
    }
  }
  // Barrier domains: same walk over the second table.
  if (__kmp_itt_barrier_domains.count > 0) {
    for (int slot = 0; slot < KMP_MAX_FRAME_DOMAINS; ++slot) {
      kmp_itthash_entry_t *entry = __kmp_itt_barrier_domains.buckets[slot];
      for (kmp_itthash_entry_t *following; entry; entry = following) {
        following = entry->next_in_bucket;
        __kmp_thread_free(th, entry);
      }
    }
  }
#endif
}
static void __kmp_internal_end(void) {
int i;
__kmp_unregister_library();
#if KMP_OS_WINDOWS
reclaim the data structures for any root threads that have died but not
unregistered themselves, in order to shut down cleanly.
In Win dynamic library we also can't tell when a thread dies. */
__kmp_reclaim_dead_roots();
#endif
for (i = 0; i < __kmp_threads_capacity; i++)
if (__kmp_root[i])
if (__kmp_root[i]->r.r_active)
break;
KMP_MB();
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
KMP_MB();
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
if (TCR_4(__kmp_init_monitor)) {
__kmp_reap_monitor(&__kmp_monitor);
TCW_4(__kmp_init_monitor, 0);
}
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
} else {
#ifdef KMP_DEBUG
for (i = 0; i < __kmp_threads_capacity; i++) {
if (__kmp_root[i]) {
KMP_ASSERT(!__kmp_root[i]->r.r_active);
}
}
#endif
KMP_MB();
while (__kmp_thread_pool != NULL) {
kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
__kmp_thread_pool = thread->th.th_next_pool;
KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
thread->th.th_next_pool = NULL;
thread->th.th_in_pool = FALSE;
__kmp_reap_thread(thread, 0);
}
__kmp_thread_pool_insert_pt = NULL;
while (__kmp_team_pool != NULL) {
kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
__kmp_team_pool = team->t.t_next_pool;
team->t.t_next_pool = NULL;
__kmp_reap_team(team);
}
__kmp_reap_task_teams();
#if KMP_OS_UNIX
for (i = 0; i < __kmp_threads_capacity; i++) {
kmp_info_t *thr = __kmp_threads[i];
while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
KMP_CPU_PAUSE();
}
#endif
for (i = 0; i < __kmp_threads_capacity; ++i) {
}
worker threads before resetting this flag */
TCW_SYNC_4(__kmp_init_common, FALSE);
KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
KMP_MB();
#if KMP_USE_MONITOR
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
if (TCR_4(__kmp_init_monitor)) {
__kmp_reap_monitor(&__kmp_monitor);
TCW_4(__kmp_init_monitor, 0);
}
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
}
TCW_4(__kmp_init_gtid, FALSE);
KMP_MB();
__kmp_cleanup();
#if OMPT_SUPPORT
ompt_fini();
#endif
}
/* Shut the whole library down.

   gtid_req  gtid of the calling thread, or a negative value to look it up
             with __kmp_gtid_get_specific()

   Returns early when the library is aborting, already finished or not
   serially initialized, and when the caller is the monitor, a worker thread,
   or a root that is still active. Otherwise runs __kmp_internal_end() under
   __kmp_initz_lock and __kmp_forkjoin_lock and finalizes the allocator. */
void __kmp_internal_end_library(int gtid_req) {
  /* If we have already cleaned up, don't try again. This shouldn't be a race
     condition because __kmp_internal_end() is the only place to clear
     __kmp_init_serial. The same checks are repeated below under the lock. */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  // If the hidden helper team has been initialized, tear it down first.
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    // Release the main helper thread, then wait for de-initialization.
    __kmp_hidden_helper_main_thread_release();
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB();
  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* unknown caller: fall through and still shut the library down */
    } else if (KMP_UBER_GTID(gtid)) {
      /* the caller is a root thread */
      if (__kmp_root[gtid]->r.r_active) {
        // The root is still active: flag abort/done and bail out.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        __kmp_unregister_library();
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        __kmp_itthash_clean(__kmp_threads[gtid]);
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* worker threads may call this function through the atexit handler, if
       * they call exit() */
      /* For now, skip the usual subsequent processing and just dump the debug
         buffer. TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      __kmp_unregister_library();
      return;
    }
  }

  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* re-check under the lock: have we already finished? */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();
}
/* Per-thread shutdown entry point.

   gtid_req: global thread id of the calling thread; a negative value means
             "look it up with __kmp_gtid_get_specific()".

   Depending on who the caller is, this either returns immediately (worker,
   monitor, unregistered thread, shutdown already in progress), flags an abort
   (root that is still active), or unregisters the calling root. The runtime
   itself is torn down with __kmp_internal_end() only when no uber thread
   remains registered and, for dynamic builds, only when the library is
   hard-paused. */
void __kmp_internal_end_thread(int gtid_req) {
  /* Unlocked fast-path checks: if we have already cleaned up, don't try
     again. Per the original note, __kmp_internal_end() is the only place to
     clear __kmp_init_serial, so this is not expected to race. Both checks
     are repeated below once __kmp_initz_lock is held. */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  // If the hidden helper team was initialized and is not yet marked done,
  // mark it done, release its main thread and wait for its de-initialization.
  if (TCR_4(__kmp_init_hidden_helper) &&
      !TCR_4(__kmp_hidden_helper_team_done)) {
    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
    __kmp_hidden_helper_main_thread_release();
    __kmp_hidden_helper_threads_deinitz_wait();
  }

  KMP_MB();

  /* Find out who we are and what we should do. */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
    } else if (KMP_UBER_GTID(gtid)) {
      if (__kmp_root[gtid]->r.r_active) {
        // The root is still active: flag abort + done and leave without any
        // further cleanup.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* Just a worker thread: drop its task team pointer and leave. */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }
      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if KMP_DYNAMIC_LIB
  // For the dynamic library the full shutdown is skipped here unless the
  // runtime is hard-paused.
  if (__kmp_pause_status != kmp_hard_paused) {
    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
    return;
  }
#endif
  /* Synchronize the termination process. */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* Re-check the "already finished" conditions now that the lock is held. */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  // If any uber thread is still registered, the runtime must stay alive.
  for (int i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    }
  }

  /* No siblings left: conduct the actual termination. */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
}
// Registration flag of this library copy. __kmp_register_library_startup()
// sets it to 0xCAFE0000 | (low 16 bits of the system time value) and
// __kmp_unregister_library() resets it to 0.
static long __kmp_registration_flag = 0;
// Registration string "<address of flag>-<flag value>-<library file>" built by
// __kmp_register_library_startup(); freed and reset to NULL by
// __kmp_unregister_library().
static char *__kmp_registration_str = NULL;
static inline char *__kmp_reg_status_name() {
each thread. If registration and unregistration go in different threads
(omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
env var can not be found, because the name will contain different pid. */
#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
(int)getuid());
#else
return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
#endif
}
#if defined(KMP_USE_SHM)
// True while the POSIX shared-memory object is the registration medium; set
// from __kmp_detect_shm() and cleared on any SHM open/size/map failure in
// __kmp_register_library_startup().
bool __kmp_shm_available = false;
// True while a file under /tmp is the registration medium (only consulted when
// SHM is not available).
bool __kmp_tmp_available = false;
// Path of the /tmp registration file ("/tmp/<name>"), allocated in
// __kmp_register_library_startup() and freed in __kmp_unregister_library().
char *temp_reg_status_file_name = nullptr;
#endif
// Registers this copy of the runtime for the current process and detects
// whether another copy is already registered.
//
// A registration string "<address of flag>-<flag value>-<library file>" is
// written to a per-process record named by __kmp_reg_status_name(). With
// KMP_USE_SHM the record is a POSIX shared-memory object, falling back to a
// file under /tmp and finally to an environment variable; without KMP_USE_SHM
// only the environment variable is used. If the record already holds a
// different string, the other copy is classified as unknown/alive/dead:
//  - unknown or alive: fatal error unless KMP_DUPLICATE_LIB_OK is true;
//  - dead: the stale record is removed and registration is retried.
void __kmp_register_library_startup(void) {
// Name of the registration record; freed at the end of this function.
char *name = __kmp_reg_status_name();
int done = 0;
// The time stamp is read as a double and its bits are reused as a long to
// derive the low 16 bits of the registration flag.
union {
double dtime;
long ltime;
} time;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
__kmp_initialize_system_tick();
#endif
__kmp_read_system_time(&time.dtime);
__kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
__kmp_registration_str =
__kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
__kmp_registration_flag, KMP_LIBRARY_FILE);
KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
__kmp_registration_str));
// Retry loop: repeats only after a stale (dead) registration was removed.
while (!done) {
// Current content of the registration record (heap copy, freed per iteration).
char *value = NULL;
#if defined(KMP_USE_SHM)
char *shm_name = nullptr;
char *data1 = nullptr;
__kmp_shm_available = __kmp_detect_shm();
if (__kmp_shm_available) {
int fd1 = -1;
shm_name = __kmp_str_format("/%s", name);
int shm_preexist = 0;
// Try to create the object exclusively; EEXIST means somebody (possibly a
// dead process) registered before us, so open the existing object instead.
fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
if ((fd1 == -1) && (errno == EEXIST)) {
fd1 = shm_open(shm_name, O_RDWR, 0600);
if (fd1 == -1) {
KMP_WARNING(FunctionError, "Can't open SHM");
__kmp_shm_available = false;
} else {
shm_preexist = 1;
}
}
// A freshly created object must be sized before it can be mapped.
if (__kmp_shm_available && shm_preexist == 0) {
if (ftruncate(fd1, SHM_SIZE) == -1) {
KMP_WARNING(FunctionError, "Can't set size of SHM");
__kmp_shm_available = false;
}
}
if (__kmp_shm_available) {
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
fd1, 0);
if (data1 == MAP_FAILED) {
KMP_WARNING(FunctionError, "Can't map SHM");
__kmp_shm_available = false;
}
}
if (__kmp_shm_available) {
// Only the creator writes its string; everybody reads back what is stored.
if (shm_preexist == 0) {
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
}
value = __kmp_str_format("%s", data1);
munmap(data1, SHM_SIZE);
}
if (fd1 != -1)
close(fd1);
}
// SHM did not work: try a regular file under /tmp with the same protocol.
if (!__kmp_shm_available)
__kmp_tmp_available = __kmp_detect_tmp();
if (!__kmp_shm_available && __kmp_tmp_available) {
int fd1 = -1;
// NOTE(review): the previous string is not freed when the loop retries via
// this path -- looks like a small leak on retry; confirm.
temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
int tmp_preexist = 0;
fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
if ((fd1 == -1) && (errno == EEXIST)) {
fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
if (fd1 == -1) {
KMP_WARNING(FunctionError, "Can't open TEMP");
__kmp_tmp_available = false;
} else {
tmp_preexist = 1;
}
}
if (__kmp_tmp_available && tmp_preexist == 0) {
if (ftruncate(fd1, SHM_SIZE) == -1) {
KMP_WARNING(FunctionError, "Can't set size of /tmp file");
__kmp_tmp_available = false;
}
}
if (__kmp_tmp_available) {
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
fd1, 0);
if (data1 == MAP_FAILED) {
KMP_WARNING(FunctionError, "Can't map /tmp");
__kmp_tmp_available = false;
}
}
if (__kmp_tmp_available) {
if (tmp_preexist == 0) {
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
}
value = __kmp_str_format("%s", data1);
munmap(data1, SHM_SIZE);
}
if (fd1 != -1)
close(fd1);
}
// Neither SHM nor /tmp: fall back to the environment variable. The third
// argument 0 is presumably "do not overwrite an existing value" -- confirm
// against __kmp_env_set().
if (!__kmp_shm_available && !__kmp_tmp_available) {
__kmp_env_set(name, __kmp_registration_str, 0);
value = __kmp_env_get(name);
}
#else
// No SHM support compiled in: the environment variable is the only medium.
__kmp_env_set(name, __kmp_registration_str, 0);
value = __kmp_env_get(name);
#endif
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// The record holds our own string: registration succeeded.
done = 1;
} else {
// The record holds something else: another copy registered first. Parse its
// string to decide whether that copy is still alive.
// neighbor: 0 = unknown (unparsable), 1 = alive, 2 = dead.
int neighbor = 0;
char *tail = value;
char *flag_addr_str = NULL;
char *flag_val_str = NULL;
char const *file_name = NULL;
__kmp_str_split(tail, '-', &flag_addr_str, &tail);
__kmp_str_split(tail, '-', &flag_val_str, &tail);
file_name = tail;
if (tail != NULL) {
unsigned long *flag_addr = 0;
unsigned long flag_val = 0;
KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
KMP_SSCANF(flag_val_str, "%lx", &flag_val);
if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
// Alive only if the recorded flag address is mapped in this process and
// still holds the recorded flag value.
if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
neighbor = 1;
} else {
neighbor = 2;
}
}
}
switch (neighbor) {
case 0:
// Unknown status is handled like a live neighbor (intentional fallthrough).
file_name = "unknown library";
KMP_FALLTHROUGH();
case 1: {
// A second live copy is only tolerated when KMP_DUPLICATE_LIB_OK is true.
char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
if (!__kmp_str_match_true(duplicate_ok)) {
__kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
KMP_HNT(DuplicateLibrary), __kmp_msg_null);
}
KMP_INTERNAL_FREE(duplicate_ok);
__kmp_duplicate_library_ok = 1;
done = 1;
} break;
case 2: {
// Dead neighbor: remove the stale record; the loop then registers again.
#if defined(KMP_USE_SHM)
if (__kmp_shm_available) {
shm_unlink(shm_name);
} else if (__kmp_tmp_available) {
unlink(temp_reg_status_file_name);
} else {
__kmp_env_unset(name);
}
#else
__kmp_env_unset(name);
#endif
} break;
default: {
KMP_DEBUG_ASSERT(0);
} break;
}
}
KMP_INTERNAL_FREE((void *)value);
#if defined(KMP_USE_SHM)
if (shm_name)
KMP_INTERNAL_FREE((void *)shm_name);
#endif
}
KMP_INTERNAL_FREE((void *)name);
}
/* Remove this library copy's registration record, if the record still holds
   our own registration string, and release the registration bookkeeping.

   The record is read back from the same medium that
   __kmp_register_library_startup() selected (shared memory, /tmp file or
   environment variable) and is only deleted when its content equals
   __kmp_registration_str. Afterwards __kmp_registration_flag,
   __kmp_registration_str and (with KMP_USE_SHM) temp_reg_status_file_name
   are reset. */
void __kmp_unregister_library(void) {
  char *name = __kmp_reg_status_name();
  char *value = NULL; // heap copy of the record's current content

#if defined(KMP_USE_SHM)
  char *shm_name = nullptr;
  int fd1;
  if (__kmp_shm_available) {
    shm_name = __kmp_str_format("/%s", name);
    fd1 = shm_open(shm_name, O_RDONLY, 0600);
    if (fd1 != -1) {
      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format("%s", data1);
        munmap(data1, SHM_SIZE);
      }
      close(fd1);
    }
  } else if (__kmp_tmp_available) {
    fd1 = open(temp_reg_status_file_name, O_RDONLY);
    if (fd1 != -1) {
      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
      if (data1 != MAP_FAILED) {
        value = __kmp_str_format("%s", data1);
        munmap(data1, SHM_SIZE);
      }
      close(fd1);
    }
  } else {
    value = __kmp_env_get(name);
  }
#else
  value = __kmp_env_get(name);
#endif

  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
    // The record is ours: delete it from whichever medium holds it.
#if defined(KMP_USE_SHM)
    if (__kmp_shm_available) {
      shm_unlink(shm_name);
    } else if (__kmp_tmp_available) {
      unlink(temp_reg_status_file_name);
    } else {
      __kmp_env_unset(name);
    }
#else
    __kmp_env_unset(name);
#endif
  }

#if defined(KMP_USE_SHM)
  if (shm_name)
    KMP_INTERNAL_FREE(shm_name);
  if (temp_reg_status_file_name) {
    KMP_INTERNAL_FREE(temp_reg_status_file_name);
    // Reset the global so that a later call (e.g. after the library is
    // registered again through the SHM or environment path) does not free
    // the same pointer a second time.
    temp_reg_status_file_name = nullptr;
  }
#endif

  KMP_INTERNAL_FREE(__kmp_registration_str);
  KMP_INTERNAL_FREE(value);
  KMP_INTERNAL_FREE(name);

  __kmp_registration_flag = 0;
  __kmp_registration_str = NULL;
}
#if KMP_MIC_SUPPORTED
// Set __kmp_mic_type (mic2, mic3 or non_mic) from the EAX value returned by
// CPUID leaf 1.
static void __kmp_check_mic_type() {
  kmp_cpuid_t regs = {0};
  __kmp_x86_cpuid(1, 0, &regs);

  if ((regs.eax & 0xff0) == 0xB10) {
    __kmp_mic_type = mic2;
    return;
  }
  if ((regs.eax & 0xf0ff0) == 0x50670) {
    __kmp_mic_type = mic3;
    return;
  }
  __kmp_mic_type = non_mic;
}
#endif
#if KMP_HAVE_UMWAIT
// UMWAIT flavour: derive __kmp_waitpkg_enabled from bit 5 of ECX returned by
// CPUID leaf 7 / sub-leaf 0, then combine it with the user settings to
// compute the umwait and tpause enable flags.
static void __kmp_user_level_mwait_init() {
  struct kmp_cpuid regs;
  __kmp_x86_cpuid(7, 0, &regs);

  __kmp_waitpkg_enabled = ((regs.ecx >> 5) & 1);
  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);

  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                __kmp_umwait_enabled));
}
#elif KMP_HAVE_MWAIT
#ifndef AT_INTELPHIUSERMWAIT
#define AT_INTELPHIUSERMWAIT 10000
#endif
// Weak declaration followed by a local definition returning 0; presumably a
// real getauxval() from the C library takes precedence when present --
// confirm against KMP_WEAK_ATTRIBUTE_EXTERNAL.
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
unsigned long getauxval(unsigned long) { return 0; }
// MWAIT flavour: on mic3 only, enable user-level mwait when either the
// auxiliary vector reports it (bit 0) or the user requested it; other
// processor types leave __kmp_mwait_enabled untouched.
static void __kmp_user_level_mwait_init() {
  if (__kmp_mic_type == mic3) {
    unsigned long aux = getauxval(AT_INTELPHIUSERMWAIT);
    if (!(aux & 0x1) && !__kmp_user_level_mwait) {
      __kmp_mwait_enabled = FALSE;
    } else {
      __kmp_mwait_enabled = TRUE;
      // Inform the user when the setting was forced by the environment.
      if (__kmp_user_level_mwait) {
        KMP_INFORM(EnvMwaitWarn);
      }
    }
  }
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
                "__kmp_mwait_enabled = %d\n",
                __kmp_mic_type, __kmp_mwait_enabled));
}
#endif
static void __kmp_do_serial_initialize(void) {
int i, gtid;
size_t size;
KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
#if OMPT_SUPPORT
ompt_pre_init();
#endif
#if OMPD_SUPPORT
__kmp_env_dump();
ompd_init();
#endif
__kmp_validate_locks();
#if ENABLE_LIBOMPTARGET
__kmp_init_omptarget();
#endif
__kmp_init_allocator();
shared memory file and check to see whether another copy of the library is
already registered. Since forked child process is often terminated, we
postpone the registration till middle initialization in the child */
if (__kmp_need_register_serial)
__kmp_register_library_startup();
if (TCR_4(__kmp_global.g.g_done)) {
KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
}
__kmp_global.g.g_abort = 0;
TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
__kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
__kmp_stats_init();
#endif
__kmp_init_lock(&__kmp_global_lock);
__kmp_init_queuing_lock(&__kmp_dispatch_lock);
__kmp_init_lock(&__kmp_debug_lock);
__kmp_init_atomic_lock(&__kmp_atomic_lock);
__kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
__kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
__kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
__kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
__kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
__kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
__kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
__kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
__kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
__kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
__kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
__kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
__kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
__kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
__kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
__kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
__kmp_runtime_initialize();
#if KMP_MIC_SUPPORTED
__kmp_check_mic_type();
#endif
#ifdef KMP_DEBUG
kmp_diag = 0;
#endif
__kmp_abort_delay = 0;
__kmp_dflt_team_nth_ub = __kmp_xproc;
if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
__kmp_dflt_team_nth_ub = KMP_MIN_NTH;
}
if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
__kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
}
__kmp_max_nth = __kmp_sys_max_nth;
__kmp_cg_max_nth = __kmp_sys_max_nth;
__kmp_teams_max_nth = __kmp_xproc;
if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
__kmp_teams_max_nth = __kmp_sys_max_nth;
}
__kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
__kmp_monitor_wakeups =
KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
__kmp_bt_intervals =
KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
__kmp_library = library_throughput;
__kmp_static = kmp_sch_static_balanced;
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
#endif
for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
__kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
__kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
__kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
__kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
if (i == bs_reduction_barrier) {
__kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
__kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
__kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
__kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
}
#endif
}
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type == mic2) {
__kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3;
__kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
1;
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
__kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
}
#if KMP_FAST_REDUCTION_BARRIER
if (__kmp_mic_type == mic2) {
__kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
__kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
}
#endif
#endif
#ifdef KMP_DEBUG
__kmp_env_checks = TRUE;
#else
__kmp_env_checks = FALSE;
#endif
__kmp_foreign_tp = TRUE;
__kmp_global.g.g_dynamic = FALSE;
__kmp_global.g.g_dynamic_mode = dynamic_default;
__kmp_init_nesting_mode();
__kmp_env_initialize(NULL);
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
__kmp_user_level_mwait_init();
#endif
#ifdef KMP_DEBUG
char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
if (__kmp_str_match_true(val)) {
kmp_str_buf_t buffer;
__kmp_str_buf_init(&buffer);
__kmp_i18n_dump_catalog(&buffer);
__kmp_printf("%s", buffer.str);
__kmp_str_buf_free(&buffer);
}
__kmp_env_free(&val);
#endif
__kmp_threads_capacity =
__kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
__kmp_tp_capacity = __kmp_default_tp_capacity(
__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
__kmp_thread_pool = NULL;
__kmp_thread_pool_insert_pt = NULL;
__kmp_team_pool = NULL;
* expandable */
size =
(sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
CACHE_LINE;
__kmp_threads = (kmp_info_t **)__kmp_allocate(size);
__kmp_root = (kmp_root_t **)((char *)__kmp_threads +
sizeof(kmp_info_t *) * __kmp_threads_capacity);
KMP_DEBUG_ASSERT(__kmp_all_nth ==
0);
KMP_DEBUG_ASSERT(__kmp_nth == 0);
__kmp_all_nth = 0;
__kmp_nth = 0;
gtid = __kmp_register_root(TRUE);
KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
KMP_ASSERT(KMP_UBER_GTID(gtid));
KMP_ASSERT(KMP_INITIAL_GTID(gtid));
KMP_MB();
__kmp_common_initialize();
#if KMP_OS_UNIX
__kmp_register_atfork();
#endif
#if !KMP_DYNAMIC_LIB || \
((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
{
library and macOS* dynamic. For other dynamic libraries, we already
have _fini and DllMain. */
int rc = atexit(__kmp_internal_end_atexit);
if (rc != 0) {
__kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
__kmp_msg_null);
}
}
#endif
#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
signal handlers so that the user handlers are called first. this way they
can return false, not call our handler, avoid terminating the library, and
continue execution where they left off. */
__kmp_install_signals(FALSE);
#endif
#if KMP_OS_WINDOWS
__kmp_install_signals(TRUE);
#endif
#endif
__kmp_init_counter++;
__kmp_init_serial = TRUE;
if (__kmp_version) {
__kmp_print_version_1();
}
if (__kmp_settings) {
__kmp_env_print();
}
if (__kmp_display_env || __kmp_display_env_verbose) {
__kmp_env_print_2();
}
#if OMPT_SUPPORT
ompt_post_init();
#endif
KMP_MB();
KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
// Public entry point for serial initialization. Returns at once when
// __kmp_init_serial is already set; otherwise takes __kmp_initz_lock,
// re-checks the flag and runs __kmp_do_serial_initialize() if still needed.
void __kmp_serial_initialize(void) {
  if (__kmp_init_serial) {
    return;
  }

  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  // Another thread may have completed the initialization while we were
  // waiting for the lock.
  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
static void __kmp_do_middle_initialize(void) {
int i, j;
int prev_dflt_team_nth;
if (!__kmp_init_serial) {
__kmp_do_serial_initialize();
}
KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
if (UNLIKELY(!__kmp_need_register_serial)) {
__kmp_register_library_startup();
}
prev_dflt_team_nth = __kmp_dflt_team_nth;
#if KMP_AFFINITY_SUPPORTED
__kmp_affinity_initialize(__kmp_affinity);
#endif
KMP_ASSERT(__kmp_xproc > 0);
if (__kmp_avail_proc == 0) {
__kmp_avail_proc = __kmp_xproc;
}
j = 0;
while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
__kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
__kmp_avail_proc;
j++;
}
if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
__kmp_dflt_team_nth = __kmp_ncores;
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
"__kmp_ncores (%d)\n",
__kmp_dflt_team_nth));
#else
__kmp_dflt_team_nth = __kmp_avail_proc;
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
"__kmp_avail_proc(%d)\n",
__kmp_dflt_team_nth));
#endif
}
if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
__kmp_dflt_team_nth = KMP_MIN_NTH;
}
if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
__kmp_dflt_team_nth = __kmp_sys_max_nth;
}
if (__kmp_nesting_mode > 0)
__kmp_set_nesting_mode_threads();
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
for (i = 0; i < __kmp_threads_capacity; i++) {
kmp_info_t *thread = __kmp_threads[i];
if (thread == NULL)
continue;
if (thread->th.th_current_task->td_icvs.nproc != 0)
continue;
set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
}
}
KA_TRACE(
20,
("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
__kmp_dflt_team_nth));
#ifdef KMP_ADJUST_BLOCKTIME
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
if (__kmp_nth > __kmp_avail_proc) {
__kmp_zero_bt = TRUE;
}
}
#endif
TCW_SYNC_4(__kmp_init_middle, TRUE);
KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
// Public entry point for middle initialization. Returns at once when
// __kmp_init_middle is already set; otherwise takes __kmp_initz_lock,
// re-checks the flag and runs __kmp_do_middle_initialize() if still needed.
void __kmp_middle_initialize(void) {
  if (__kmp_init_middle) {
    return;
  }

  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  // Another thread may have finished the job while we waited for the lock.
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
void __kmp_parallel_initialize(void) {
int gtid = __kmp_entry_gtid();
if (TCR_4(__kmp_init_parallel))
return;
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
if (TCR_4(__kmp_init_parallel)) {
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
return;
}
if (TCR_4(__kmp_global.g.g_done)) {
KA_TRACE(
10,
("__kmp_parallel_initialize: attempt to init while shutting down\n"));
__kmp_infinite_loop();
}
__kmp_serial_initialize would cause a deadlock. So we call
__kmp_do_serial_initialize directly. */
if (!__kmp_init_middle) {
__kmp_do_middle_initialize();
}
__kmp_assign_root_init_mask();
__kmp_resume_if_hard_paused();
KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
KMP_ASSERT(KMP_UBER_GTID(gtid));
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
__kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
__kmp_store_mxcsr(&__kmp_init_mxcsr);
__kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif
#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
__kmp_install_signals(TRUE);
#endif
#endif
__kmp_suspend_initialize();
#if defined(USE_LOAD_BALANCE)
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
__kmp_global.g.g_dynamic_mode = dynamic_load_balance;
}
#else
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
__kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
}
#endif
if (__kmp_version) {
__kmp_print_version_2();
}
TCW_SYNC_4(__kmp_init_parallel, TRUE);
KMP_MB();
KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
// Initialize the hidden helper team once. Parallel initialization is
// triggered first if needed, before __kmp_initz_lock is taken here
// (__kmp_parallel_initialize() acquires that lock itself).
void __kmp_hidden_helper_initialize() {
  if (TCR_4(__kmp_init_hidden_helper))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  // Re-check under the lock; somebody else may have finished meanwhile.
  if (!TCR_4(__kmp_init_hidden_helper)) {
#if KMP_AFFINITY_SUPPORTED
    if (!__kmp_hh_affinity.flags.initialized)
      __kmp_affinity_initialize(__kmp_hh_affinity);
#endif
    // No hidden helper tasks are pending yet.
    KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);

    // Flag that hidden helper threads are being initialized, start them and
    // wait until their initialization has completed.
    TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
    __kmp_do_initialize_hidden_helper_threads();
    __kmp_hidden_helper_threads_initz_wait();

    TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
  }
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
kmp_team_t *team) {
kmp_disp_t *dispatch;
KMP_MB();
this_thr->th.th_local.this_construct = 0;
#if KMP_CACHE_MANAGE
KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
#endif
dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
KMP_DEBUG_ASSERT(dispatch);
KMP_DEBUG_ASSERT(team->t.t_dispatch);
dispatch->th_disp_index = 0;
dispatch->th_doacross_buf_idx = 0;
if (__kmp_env_consistency_check)
__kmp_push_parallel(gtid, team->t.t_ident);
KMP_MB();
}
// Counterpart of __kmp_run_before_invoked_task(): pops the consistency-check
// "parallel" entry (when checking is enabled) and finishes the thread's
// implicit task. `tid` is not used in this function.
void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                  kmp_team_t *team) {
  if (__kmp_env_consistency_check) {
    __kmp_pop_parallel(gtid, team->t.t_ident);
  }

  __kmp_finish_implicit_task(this_thr);
}
// Invokes the team's microtask (team->t.t_pkfn) on the calling thread.
//
// gtid: global thread id of the calling thread.
// Returns the result of __kmp_invoke_microtask().
//
// Around the call it runs the before/after bookkeeping helpers and, depending
// on the build configuration, ITT stack-stitching notifications, the OMPT
// implicit-task begin callback and statistics timers.
int __kmp_invoke_task_func(int gtid) {
int rc;
int tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = this_thr->th.th_team;
__kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
// Report entry into the callee using the team's stack id, or the parent
// team's id when this team has none.
if (__itt_stack_caller_create_ptr) {
if (team->t.t_stack_id != NULL) {
__kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
} else {
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
__kmp_itt_stack_callee_enter(
(__itt_caller)team->t.t_parent->t.t_stack_id);
}
}
#endif
#if INCLUDE_SSC_MARKS
SSC_MARK_INVOKING();
#endif
#if OMPT_SUPPORT
// exit_frame_p points at the implicit task's exit-frame slot when OMPT is
// enabled, otherwise at a local dummy so the microtask invocation can always
// be given a valid location.
void *dummy;
void **exit_frame_p;
ompt_data_t *my_task_data;
ompt_data_t *my_parallel_data;
int ompt_team_size;
if (ompt_enabled.enabled) {
exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
.ompt_task_info.frame.exit_frame.ptr);
} else {
exit_frame_p = &dummy;
}
my_task_data =
&(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
my_parallel_data = &(team->t.ompt_team_info.parallel_data);
// Signal the beginning of the implicit task to a registered tool.
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_team_size = team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
__kmp_tid_from_gtid(gtid), ompt_task_implicit);
OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
}
#endif
#if KMP_STATS_ENABLED
// Choose the timer by whether the thread was in a teams region, then mark
// the thread as running an implicit task.
stats_state_e previous_state = KMP_GET_THREAD_STATE();
if (previous_state == stats_state_e::TEAMS_REGION) {
KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
} else {
KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
}
KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif
// The actual microtask call; the exit-frame argument only exists in OMPT
// builds.
rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
,
exit_frame_p
#endif
);
#if OMPT_SUPPORT
// Clear the exit frame again and record that a team-level parallel region
// was executed.
*exit_frame_p = NULL;
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
#endif
#if KMP_STATS_ENABLED
// The thread state is only restored for the teams-region case.
if (previous_state == stats_state_e::TEAMS_REGION) {
KMP_SET_THREAD_STATE(previous_state);
}
KMP_POP_PARTITIONED_TIMER();
#endif
#if USE_ITT_BUILD
// Mirror of the callee-enter notification above.
if (__itt_stack_caller_create_ptr) {
if (team->t.t_stack_id != NULL) {
__kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
} else {
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
__kmp_itt_stack_callee_leave(
(__itt_caller)team->t.t_parent->t.t_stack_id);
}
}
#endif
__kmp_run_after_invoked_task(gtid, tid, this_thr, team);
return rc;
}
void __kmp_teams_master(int gtid) {
kmp_info_t *thr = __kmp_threads[gtid];
kmp_team_t *team = thr->th.th_team;
ident_t *loc = team->t.t_ident;
thr->th.th_set_nproc = thr->th.th_teams_size.nth;
KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
__kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
tmp->cg_root = thr;
tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
tmp->cg_nthreads = 1;
KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
" cg_nthreads to 1\n",
thr, tmp));
tmp->up = thr->th.th_cg_roots;
thr->th.th_cg_roots = tmp;
#if INCLUDE_SSC_MARKS
SSC_MARK_FORKING();
#endif
__kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
(microtask_t)thr->th.th_teams_microtask,
VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
#if INCLUDE_SSC_MARKS
SSC_MARK_JOINING();
#endif
if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
thr->th.th_teams_size.nth = thr->th.th_team_nproc;
__kmp_join_call(loc, gtid
#if OMPT_SUPPORT
,
fork_context_intel
#endif
,
1);
}
// Launch routine that runs __kmp_teams_master() on the calling thread,
// wrapped in the same before/after bookkeeping as a regular microtask
// invocation (with tid 0) and, in OMPT builds, the implicit-task begin
// callback for an initial task.
//
// gtid: global thread id of the calling thread.
// Always returns 1.
int __kmp_invoke_teams_master(int gtid) {
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = this_thr->th.th_team;
#if KMP_DEBUG
// For a non-serialized team, the team's microtask must be __kmp_teams_master.
if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
(void *)__kmp_teams_master);
#endif
__kmp_run_before_invoked_task(gtid, 0, this_thr, team);
#if OMPT_SUPPORT
int tid = __kmp_tid_from_gtid(gtid);
ompt_data_t *task_data =
&team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
// Report the start of the initial task of this league member to the tool.
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
ompt_task_initial);
OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
}
#endif
__kmp_teams_master(gtid);
#if OMPT_SUPPORT
// Record that the region just executed was a league (teams) region.
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
#endif
__kmp_run_after_invoked_task(gtid, 0, this_thr, team);
return 1;
}
/* this sets the requested number of threads for the next parallel region
   encountered by this team. since this should be enclosed in the forkjoin
   critical section it should avoid race conditions with asymmetrical nested
   parallelism */
// Record `num_threads` in thread gtid's th_set_nproc field. Non-positive
// values are ignored. `id` is not used in this function.
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
  if (num_threads <= 0)
    return;
  __kmp_threads[gtid]->th.th_set_nproc = num_threads;
}
// Record a list of num_threads values for thread gtid: the first entry (if
// positive) goes into th_set_nproc, and the complete list is copied into a
// newly allocated th_set_nested_nth array of list_length ints.
// `list_length` is expected to be greater than 1 (debug-asserted); `id` is
// not used in this function.
// NOTE(review): the allocation result is not checked and a previously stored
// list is not released here -- presumably the consumer frees it; confirm.
void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
                                 int *num_threads_list) {
  kmp_info_t *self = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(list_length > 1);

  if (num_threads_list[0] > 0) {
    self->th.th_set_nproc = num_threads_list[0];
  }

  self->th.th_set_nested_nth =
      (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
  kmp_uint32 idx = 0;
  while (idx < list_length) {
    self->th.th_set_nested_nth[idx] = num_threads_list[idx];
    ++idx;
  }
  self->th.th_set_nested_nth_sz = list_length;
}
// Mark thread gtid's pending num_threads request as strict and store the
// associated source location, severity and message.
//   sev: kept only when it is severity_warning; anything else becomes
//        severity_fatal.
//   msg: message to store; a built-in default text is used when NULL.
void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
                                  const char *msg) {
  kmp_info_t *self = __kmp_threads[gtid];

  self->th.th_nt_strict = true;
  self->th.th_nt_loc = loc;
  self->th.th_nt_sev = (sev == severity_warning) ? sev : severity_fatal;
  self->th.th_nt_msg =
      msg ? msg
          : "Cannot form team with number of threads specified by "
            "strict num_threads clause.";
}
// Compute the per-team thread count for a teams construct and store it in
// thr->th.th_teams_size.nth.
//   thr:         thread encountering the teams construct.
//   num_teams:   number of teams already chosen by the caller (used as a
//                divisor, so it is expected to be positive).
//   num_threads: requested threads per team; 0 means "not specified".
// Runs middle initialization first if it has not happened yet.
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
                                    int num_threads) {
  KMP_DEBUG_ASSERT(thr);
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  __kmp_assign_root_init_mask();
  KMP_DEBUG_ASSERT(__kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);

  if (num_threads != 0) {
    // Explicit request. Negative values are replaced by 1 with a warning.
    if (num_threads < 0) {
      __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
                __kmp_msg_null);
      num_threads = 1;
    }
    // The request becomes the thread-limit ICV of the current task.
    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
    if (num_threads > __kmp_dflt_team_nth) {
      num_threads = __kmp_dflt_team_nth;
    }
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      // Too many threads overall: shrink the team size (at least 1) and warn
      // once if that actually changes the value.
      int capped = __kmp_teams_max_nth / num_teams;
      if (capped == 0) {
        capped = 1;
      }
      if (capped != num_threads && !__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
        __kmp_msg(kmp_ms_warning,
                  KMP_MSG(CantFormThrTeam, num_threads, capped),
                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
      }
      num_threads = capped;
    }
  } else {
    // Nothing requested: start from the teams thread limit setting, or from
    // an even split of the available processors, then apply the caps
    // silently. The thread-limit ICV is left untouched on this path.
    num_threads = (__kmp_teams_thread_limit > 0)
                      ? __kmp_teams_thread_limit
                      : __kmp_avail_proc / num_teams;
    if (num_threads > __kmp_dflt_team_nth) {
      num_threads = __kmp_dflt_team_nth;
    }
    if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
      num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    }
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      num_threads = __kmp_teams_max_nth / num_teams;
    }
    if (num_threads == 0) {
      num_threads = 1;
    }
  }

  thr->th.th_teams_size.nth = num_threads;
}
/* this sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered */
// Store the number of teams for the upcoming teams construct of thread gtid
// and derive the per-team thread count.
//   num_teams:   requested teams; negative values are replaced by 1 with a
//                warning, 0 selects __kmp_nteams (if positive) or 1.
//   num_threads: forwarded to __kmp_push_thread_limit().
// The team count is capped at __kmp_teams_max_nth (warning issued once).
// `id` is not used in this function.
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
                          int num_threads) {
  kmp_info_t *self = __kmp_threads[gtid];

  if (num_teams < 0) {
    __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
              __kmp_msg_null);
    num_teams = 1;
  } else if (num_teams == 0) {
    num_teams = (__kmp_nteams > 0) ? __kmp_nteams : 1;
  }

  if (num_teams > __kmp_teams_max_nth) {
    if (!__kmp_reserve_warn) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    num_teams = __kmp_teams_max_nth;
  }

  // The team count is also the size of the outer "parallel" of the teams
  // construct, hence it is stored in th_set_nproc as well.
  self->th.th_set_nproc = self->th.th_teams_size.nteams = num_teams;

  __kmp_push_thread_limit(self, num_teams, num_threads);
}
/* this sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered */
// Record a teams-construct request expressed as a [lower, upper] bound pair.
//   num_teams_lb / num_teams_ub - requested bounds; lb > ub is a fatal error
//   num_threads                 - requested threads per team (0 = unspecified)
// The chosen team count is stored in th_set_nproc and th_teams_size.nteams and
// num_threads is forwarded to __kmp_push_thread_limit().
void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
                             int num_teams_ub, int num_threads) {
  kmp_info_t *thr = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
  KMP_DEBUG_ASSERT(num_threads >= 0);

  if (num_teams_lb > num_teams_ub) {
    __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
                KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
  }

  // Only an upper bound was supplied: treat it as an exact request.
  if (num_teams_lb == 0 && num_teams_ub > 0)
    num_teams_lb = num_teams_ub;

  int num_teams = 1;
  if (num_teams_lb == num_teams_ub) {
    if (num_teams_ub == 0) {
      // Neither bound given: use the global default, capped at the maximum.
      if (__kmp_nteams > 0)
        num_teams = __kmp_nteams;
      if (num_teams > __kmp_teams_max_nth) {
        if (!__kmp_reserve_warn) {
          __kmp_reserve_warn = 1;
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                    KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
        }
        num_teams = __kmp_teams_max_nth;
      }
    } else {
      // Exact number of teams requested.
      num_teams = num_teams_ub;
    }
  } else if (num_threads <= 0) {
    // A real range without a thread count: take the upper bound unless it
    // exceeds the global maximum, in which case take the lower bound.
    num_teams =
        (num_teams_ub > __kmp_teams_max_nth) ? num_teams_lb : num_teams_ub;
  } else {
    // A real range with a thread count: fit as many teams as the maximum
    // allows, then clamp into [lb, ub].
    if (num_threads <= __kmp_teams_max_nth)
      num_teams = __kmp_teams_max_nth / num_threads;
    if (num_teams < num_teams_lb) {
      num_teams = num_teams_lb;
    } else if (num_teams > num_teams_ub) {
      num_teams = num_teams_ub;
    }
  }

  thr->th.th_teams_size.nteams = num_teams;
  thr->th.th_set_nproc = num_teams;

  __kmp_push_thread_limit(thr, num_teams, num_threads);
}
// Store the requested proc_bind policy in th_set_proc_bind of the thread
// identified by gtid. `id` is not used.
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
  __kmp_threads[gtid]->th.th_set_proc_bind = proc_bind;
}
// Reset the per-region state of `team` and enter the fork barrier as tid 0.
// Asserts that gtid satisfies KMP_MASTER_GTID and that the calling thread's
// th_team is `team`. `id` is not used in this function.
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
kmp_info_t *this_thr = __kmp_threads[gtid];
#ifdef KMP_DEBUG
// Loop index for the debug-only consistency check further down.
int f;
#endif
KMP_DEBUG_ASSERT(team);
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
KMP_ASSERT(KMP_MASTER_GTID(gtid));
KMP_MB();
// Clear the construct counter and the ordered-section value for the new
// region.
team->t.t_construct = 0;
team->t.t_ordered.dt.t_value =
0;
// Re-number the dispatch buffers: all __kmp_dispatch_num_buffers of them when
// the team can hold more than one thread, otherwise only buffer 0.
KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
if (team->t.t_max_nproc > 1) {
int i;
for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
team->t.t_disp_buffer[i].buffer_index = i;
team->t.t_disp_buffer[i].doacross_buf_idx = i;
}
} else {
team->t.t_disp_buffer[0].buffer_index = 0;
team->t.t_disp_buffer[0].doacross_buf_idx = 0;
}
KMP_MB();
KMP_ASSERT(this_thr->th.th_team == team);
#ifdef KMP_DEBUG
// Every slot of the team must be populated and agree on the team size.
for (f = 0; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
}
#endif
__kmp_fork_barrier(gtid, 0);
}
// Enter the join barrier for `team` on behalf of the thread with the given
// gtid and, when OMPT is enabled, emit the end-of-barrier tool callbacks.
// Asserts that gtid satisfies KMP_MASTER_GTID and that the calling thread's
// th_team is `team`. `id` is not used in this function.
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
kmp_info_t *this_thr = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(team);
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
KMP_ASSERT(KMP_MASTER_GTID(gtid));
KMP_MB();
#ifdef KMP_DEBUG
// Debug builds: dump diagnostic state before the assertion below fires if the
// thread's recorded team size disagrees with the team's.
if (__kmp_threads[gtid] &&
__kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
__kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
__kmp_threads[gtid]);
__kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
"team->t.t_nproc=%d\n",
gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
team->t.t_nproc);
__kmp_print_structure();
}
KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
__kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif
__kmp_join_barrier(gtid);
#if OMPT_SUPPORT
// Callbacks are only emitted if the thread's OMPT state after the barrier is
// one of the two barrier-wait states checked below.
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
if (ompt_enabled.enabled &&
(ompt_state == ompt_state_wait_barrier_teams ||
ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
// The return address is only looked up for tid 0 and only when at least one
// of the two sync-region callbacks is registered.
void *codeptr = NULL;
if (KMP_MASTER_TID(ds_tid) &&
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
// Report a teams barrier instead of an implicit-parallel barrier when the
// ompt_parallel_league flag is set.
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
#endif
// Threads other than tid 0 additionally report the end of their implicit
// task.
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, task_data, 0, ds_tid,
ompt_task_implicit);
}
}
#endif
KMP_MB();
KMP_ASSERT(this_thr->th.th_team == team);
}
#ifdef USE_LOAD_BALANCE
// Count the threads of the root's hot team, excluding slot 0, that are
// considered active.
// Returns 0 when the root is active. With blocktime at KMP_MAX_BLOCKTIME every
// thread other than slot 0 is counted; otherwise only those whose th_active
// flag is set.
static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
  if (root->r.r_active)
    return 0;

  kmp_team_t *hot_team = root->r.r_hot_team;
  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
    return hot_team->t.t_nproc - 1;

  // Slot 0 is skipped; the caller adds 1 for it separately.
  int active = 0;
  for (int tid = 1; tid < hot_team->t.t_nproc; ++tid) {
    if (hot_team->t.t_threads[tid]->th.th_active)
      ++active;
  }
  return active;
}
// Choose a team size for a new parallel region from the current system load.
//   root      - root on which the region is being forked
//   set_nproc - requested team size (expected to be > 1)
// Returns a value in [KMP_MIN_NTH, set_nproc] (or 1 for set_nproc == 1). If
// __kmp_get_load_balance() reports an error (negative result) the dynamic mode
// is switched to dynamic_thread_limit and that formula is used instead.
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
                set_nproc));
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
                       ->th.th_current_task->td_icvs.dynamic == TRUE);
  KMP_DEBUG_ASSERT(set_nproc > 1);

  if (set_nproc == 1) {
    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
    return 1;
  }

  // Threads this process already contributes to the load: pool threads,
  // active hot-team threads, plus one for the calling thread.
  int pool_active = __kmp_thread_pool_active_nth;
  int hot_team_active = __kmp_active_hot_team_nproc(root);
  int team_curr_active = pool_active + hot_team_active + 1;

  int system_active =
      __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
                "hot team active = %d\n",
                system_active, pool_active, hot_team_active));

  int retval;
  if (system_active < 0) {
    // Load query failed: permanently fall back to the thread-limit mode.
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");

    retval = __kmp_avail_proc - __kmp_nth +
             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (retval > set_nproc)
      retval = set_nproc;
    if (retval < KMP_MIN_NTH)
      retval = KMP_MIN_NTH;

    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
                  retval));
    return retval;
  }

  // The measured load is never taken to be below our own contribution.
  if (system_active < team_curr_active)
    system_active = team_curr_active;

  retval = __kmp_avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc;
  if (retval < KMP_MIN_NTH)
    retval = KMP_MIN_NTH;

  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
  return retval;
}
#endif
// Tear down the global state of the runtime: undo the parallel, middle and
// serial initialisation stages (in that order) and release the global tables,
// lock pools, nested-ICV arrays, the affinity format string, the message
// catalog and optional subsystems. Every released pointer is reset so that a
// later re-initialisation followed by another cleanup does not touch freed
// memory.
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  __kmp_cleanup_threadprivate_caches();

  // Free every remaining root descriptor, then the thread table.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // NOTE(review): __kmp_root itself is only cleared, not freed -- presumably it
  // lives in the same allocation as __kmp_threads; confirm against the code
  // that allocates the tables.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

  // Free the list of retired thread tables.
  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
  while (ptr) {
    kmp_old_threads_list_t *next = ptr->next;
    __kmp_free(ptr->threads);
    __kmp_free(ptr);
    ptr = next;
  }
  // Reset the list head: leaving it pointing at freed nodes would make a
  // subsequent cleanup (after the runtime is initialised again) walk and free
  // them a second time.
  __kmp_old_threads_list = NULL;

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif
#if OMPD_SUPPORT
  if (ompd_state) {
    __kmp_free(ompd_env_block);
    ompd_env_block = NULL;
    ompd_env_block_size = 0;
  }
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
  __kmp_cpuinfo_file = NULL;
#endif

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif

  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;

  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;

  if (__kmp_affinity_format) {
    KMP_INTERNAL_FREE(__kmp_affinity_format);
    __kmp_affinity_format = NULL;
  }

  __kmp_i18n_catclose();

#if KMP_USE_HIER_SCHED
  __kmp_hier_scheds.deallocate();
#endif

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
// Report whether the "mppbeg" entry point should be ignored.
// Returns FALSE only when the environment variable KMP_IGNORE_MPPBEG is set to
// a value that __kmp_str_match_false() accepts; TRUE otherwise.
int __kmp_ignore_mppbeg(void) {
  char *env = getenv("KMP_IGNORE_MPPBEG");
  if (env != NULL && __kmp_str_match_false(env))
    return FALSE;
  return TRUE;
}
// Report whether the "mppend" entry point should be ignored.
// Returns FALSE only when the environment variable KMP_IGNORE_MPPEND is set to
// a value that __kmp_str_match_false() accepts; TRUE otherwise.
int __kmp_ignore_mppend(void) {
  char *env = getenv("KMP_IGNORE_MPPEND");
  if (env != NULL && __kmp_str_match_false(env))
    return FALSE;
  return TRUE;
}
void __kmp_internal_begin(void) {
int gtid;
kmp_root_t *root;
and assign these new uber threads a new gtid */
gtid = __kmp_entry_gtid();
root = __kmp_threads[gtid]->th.th_root;
KMP_ASSERT(KMP_UBER_GTID(gtid));
if (root->r.r_begin)
return;
__kmp_acquire_lock(&root->r.r_begin_lock, gtid);
if (root->r.r_begin) {
__kmp_release_lock(&root->r.r_begin_lock, gtid);
return;
}
root->r.r_begin = TRUE;
__kmp_release_lock(&root->r.r_begin_lock, gtid);
}
// User-level request to switch the library mode.
// Ignored with a warning when the calling thread's root is inside a parallel
// region. Otherwise the pending thread-count request is cleared, the nproc ICV
// is set (1 for serial mode; __kmp_dflt_team_nth, or __kmp_dflt_team_nth_ub if
// that is zero, for turnaround and throughput) and __kmp_aux_set_library()
// applies the global part. An unknown mode is fatal.
void __kmp_user_set_library(enum library_type arg) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_root_t *root = thread->th.th_root;

  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
                library_serial));

  /* Must be called in serial section of top-level thread */
  if (root->r.r_in_parallel) {
    KMP_WARNING(SetLibraryIncorrectCall);
    return;
  }

  switch (arg) {
  case library_serial:
    thread->th.th_set_nproc = 0;
    set__nproc(thread, 1);
    break;
  case library_turnaround:
  case library_throughput:
    // Both parallel modes use the same default team size.
    thread->th.th_set_nproc = 0;
    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
                                           : __kmp_dflt_team_nth_ub);
    break;
  default:
    KMP_FATAL(UnknownLibraryType, arg);
  }

  __kmp_aux_set_library(arg);
}
// Set the default stack size for threads created by the runtime.
//   arg - requested size; clamped to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE]
// Runs serial initialisation first if needed. The value is only applied while
// __kmp_init_parallel is still unset; later calls leave it untouched.
void __kmp_aux_set_stacksize(size_t arg) {
  if (!__kmp_init_serial)
    __kmp_serial_initialize();

#if KMP_OS_DARWIN
  // Round up to a multiple of 0x1000, unless the addition would wrap to 0.
  if (arg & (0x1000 - 1)) {
    arg &= ~(0x1000 - 1);
    if (arg + 0x1000)
      arg += 0x1000;
  }
#endif

  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (!TCR_4(__kmp_init_parallel)) {
    size_t clamped = arg;
    if (clamped < __kmp_sys_min_stksize) {
      clamped = __kmp_sys_min_stksize;
    } else if (clamped > KMP_MAX_STKSIZE) {
      clamped = KMP_MAX_STKSIZE;
    }
    __kmp_stksize = clamped;
    // Record that a stack size has been set explicitly.
    __kmp_env_stksize = TRUE;
  }
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
// Apply the global side of a library-mode change: store the mode in
// __kmp_library and adjust the mode-dependent defaults.
//   serial     - emits the LibraryIsSerial informational message
//   turnaround - raises __kmp_use_yield from 1 to 2 unless it was set
//                explicitly (__kmp_use_yield_exp_set)
//   throughput - replaces a KMP_MAX_BLOCKTIME blocktime with the default
// Any other value is fatal.
void __kmp_aux_set_library(enum library_type arg) {
  __kmp_library = arg;

  if (arg == library_serial) {
    KMP_INFORM(LibraryIsSerial);
  } else if (arg == library_turnaround) {
    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
      __kmp_use_yield = 2;
  } else if (arg == library_throughput) {
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
      __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
  } else {
    KMP_FATAL(UnknownLibraryType, arg);
  }
}
// Locate the team that sits one level below the teams construct enclosing the
// calling thread.
//   teams_serialized - out: set to 0 when the thread is not inside a teams
//                      construct; otherwise the serialized-level count left
//                      over when the walk below stops
// Returns NULL when th_teams_microtask is not set for the calling thread,
// otherwise the team reached by the walk.
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
kmp_info_t *thr = __kmp_entry_thread();
teams_serialized = 0;
if (thr->th.th_teams_microtask) {
kmp_team_t *team = thr->th.th_team;
int tlevel = thr->th.th_teams_level;
// ii tracks the nesting level currently being examined; the walk stops when
// it reaches tlevel + 1.
int ii = team->t.t_level;
teams_serialized = team->t.t_serialized;
int level = tlevel + 1;
KMP_DEBUG_ASSERT(ii >= tlevel);
while (ii > level) {
// Consume the serialized levels recorded on this team, one nesting level
// each, stopping early if the target level is reached.
for (teams_serialized = team->t.t_serialized;
(teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
}
// All serialized levels of this team were consumed: move to the parent
// without consuming another level.
if (team->t.t_serialized && (!teams_serialized)) {
team = team->t.t_parent;
continue;
}
// Otherwise stepping to the parent consumes one level.
if (ii > level) {
team = team->t.t_parent;
ii--;
}
}
return team;
}
return NULL;
}
int __kmp_aux_get_team_num() {
int serialized;
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
if (team) {
if (serialized > 1) {
return 0;
} else {
return team->t.t_master_tid;
}
}
return 0;
}
int __kmp_aux_get_num_teams() {
int serialized;
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
if (team) {
if (serialized > 1) {
return 1;
} else {
return team->t.t_parent->t.t_nproc;
}
}
return 1;
}
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (short name, long name), as listed in
 * __kmp_affinity_format_table:
 * t {team_num}         - omp_get_team_num()
 * T {num_teams}        - omp_get_num_teams()
 * L {nesting_level}    - omp_get_level()
 * n {thread_num}       - omp_get_thread_num()
 * N {num_threads}      - omp_get_num_threads()
 * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}             - name of host machine
 * P {process_id}       - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity}  - comma separated list of integers or integer ranges
 *                        (values of affinity mask); only available when
 *                        KMP_AFFINITY_SUPPORTED
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
// One entry of the affinity-format field table: maps the short and long
// spelling of a field type to the printf conversion used to render it.
typedef struct kmp_affinity_format_field_t {
// Single-character field name, e.g. 'n'.
char short_name;
// Long field name as written between braces, e.g. "thread_num".
const char *long_name;
// printf conversion character appended to the generated format ('d' or 's').
char field_format;
} kmp_affinity_format_field_t;
// Field types understood by the affinity format parser. The parser scans this
// table in order and stops at the first entry whose short name (or long name,
// as a prefix of the input) matches. The 'A' entry exists only when affinity
// is supported.
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
{'A', "thread_affinity", 's'},
#endif
{'t', "team_num", 'd'},
{'T', "num_teams", 'd'},
{'L', "nesting_level", 'd'},
{'n', "thread_num", 'd'},
{'N', "num_threads", 'd'},
{'a', "ancestor_tnum", 'd'},
{'H', "host", 's'},
{'P', "process_id", 'd'},
{'i', "native_thread_id", 'd'}};
// Parse one '%'-introduced field of an affinity format string and render it.
//   gtid         - global thread id of the thread being described (>= 0)
//   th           - thread descriptor that the field values are read from
//   ptr          - in/out cursor into the format string; **ptr must be '%' on
//                  entry, and on return *ptr points just past the consumed
//                  field (never past the terminating NUL)
//   field_buffer - cleared on entry, then receives the rendered field
// Returns 1 for the "%%" escape, otherwise the value returned by
// __kmp_str_buf_print() for the rendered field. Unknown or malformed field
// types render as "undefined".
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial '%'.
  (*ptr)++;

  // "%%" is a literal percent sign.
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++;
    return 1;
  }

  // Optional modifiers: '0' (zero padding) then '.' (right justification).
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++;
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++;
  }

  // Optional width: the digits in [width_left, width_right).
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Build the printf-style format from the parsed modifiers.
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // At most 8 width digits are copied, which also keeps the generated
    // format within FORMAT_SIZE.
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse the field name (short, or long inside braces) and canonicalize it
  // into absolute_short_name.
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++;
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length;
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++;
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      // Missing closing brace (or trailing characters after a matched prefix):
      // treat the field as unknown.
      absolute_short_name = 0;
    } else {
      (*ptr)++;
    }
  }

  // Render the requested value through the generated format.
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // Unknown field type: print "undefined" and skip over the field.
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else if (**ptr != '\0') {
      // Skip the unrecognized type character, but never step over the
      // terminating NUL: a format ending in "%", "%0", "%." or "%<digits>"
      // lands here with **ptr == '\0', and advancing would make the caller's
      // scan loop read past the end of the string.
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
/*
 * Return number of characters needed to hold the affinity string
 * (not including null byte character)
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards
 */
// Expand an affinity format string for the thread identified by gtid.
//   format - format to expand; NULL or "" selects __kmp_affinity_format
//   buffer - cleared, then receives the expanded text
// Returns the sum of one per literal character plus the value returned by
// __kmp_aux_capture_affinity_field() for each '%' field.
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  kmp_str_buf_t field;
  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  const kmp_info_t *th = __kmp_threads[gtid];
  size_t total = 0;

  // Fall back to the global format when none (or an empty one) was given.
  const char *cursor = format;
  if (cursor == NULL || *cursor == '\0')
    cursor = __kmp_affinity_format;
  KMP_DEBUG_ASSERT(cursor);

  while (*cursor != '\0') {
    if (*cursor != '%') {
      // Literal character: copy it through unchanged.
      __kmp_str_buf_cat(buffer, cursor, 1);
      total++;
      cursor++;
      continue;
    }
    // Field: the helper renders it into `field` and advances the cursor.
    int rc = __kmp_aux_capture_affinity_field(gtid, th, &cursor, &field);
    __kmp_str_buf_catbuf(buffer, &field);
    total += rc;
  }

  __kmp_str_buf_free(&field);
  return total;
}
void __kmp_aux_display_affinity(int gtid, const char *format) {
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_aux_capture_affinity(gtid, format, &buf);
__kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
__kmp_str_buf_free(&buf);
}
// Set the blocktime for slot `tid` of the thread's current team and for slot 0
// of its serial team.
//   arg    - requested blocktime; clamped to
//            [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]
//   thread - thread whose teams are updated
//   tid    - slot in thread->th.th_team to update
// The internal controls are saved first, and the "blocktime was set" flag is
// raised for both teams. With KMP_USE_MONITOR the matching interval count is
// stored as well.
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  __kmp_save_internal_controls(thread);

  int blocktime = arg;
  if (blocktime < KMP_MIN_BLOCKTIME) {
    blocktime = KMP_MIN_BLOCKTIME;
  } else if (blocktime > KMP_MAX_BLOCKTIME) {
    blocktime = KMP_MAX_BLOCKTIME;
  }

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  int bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  kmp_int8 bt_set = TRUE;
  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);

#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}
// Apply settings given as a string through __kmp_env_initialize(), running
// serial initialisation first if needed, and print the environment when any
// of the settings/display-env flags is on. `len` is not used.
void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial)
    __kmp_serial_initialize();

  __kmp_env_initialize(str);

  const bool want_print =
      __kmp_settings || __kmp_display_env || __kmp_display_env_verbose;
  if (want_print)
    __kmp_env_print();
}
// Select the reduction method for a reduction construct.
//   loc         - source location; its flags say whether the atomic variant
//                 was generated (KMP_IDENT_ATOMIC_REDUCE)
//   global_tid  - global thread id, used to look up the team size
//   num_vars    - number of reduction variables
//   reduce_size - size of the reduction data
//   reduce_data / reduce_func - both non-null means the tree variant was
//                 generated
//   lck         - critical-section name; must be non-null
// Returns critical_reduce_block by default, empty_reduce_block for a team of
// one, or an atomic/tree method chosen by the architecture- and OS-specific
// heuristics below. A method forced through __kmp_force_reduction_method
// overrides the heuristic for teams larger than one, falling back to critical
// (with a warning) when the forced variant was not generated.
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
kmp_critical_name *lck) {
PACKED_REDUCTION_METHOD_T retval;
int team_size;
KMP_DEBUG_ASSERT(lck);
// Local predicates describing which fast variants are available.
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
(loc && \
((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
retval = critical_reduce_block;
team_size = __kmp_get_team_num_threads(global_tid);
if (team_size == 1) {
retval = empty_reduce_block;
} else {
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
// First architecture group: decide by team size against a cutoff.
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
int teamsize_cutoff = 4;
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type != non_mic) {
teamsize_cutoff = 8;
}
#endif
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
// With a tree variant: small teams use atomic if available (else stay
// critical), larger teams use the tree with reduction barrier. Without a tree
// variant: atomic if available.
if (tree_available) {
if (team_size <= teamsize_cutoff) {
if (atomic_available) {
retval = atomic_reduce_block;
}
} else {
retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
}
} else if (atomic_available) {
retval = atomic_reduce_block;
}
#else
#error "Unknown or unsupported OS"
#endif
// Second architecture group: decide by the number of reduction variables.
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
KMP_OS_WASI || KMP_OS_AIX
// Atomic only for at most two variables; otherwise stay critical.
if (atomic_available) {
if (num_vars <= 2) {
retval = atomic_reduce_block;
}
}
#elif KMP_OS_DARWIN
// Atomic for at most three variables; otherwise the tree with plain barrier
// when the data size lies strictly between 9 and 2000 kmp_real64 elements.
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
if (atomic_available && (num_vars <= 3)) {
retval = atomic_reduce_block;
} else if (tree_available) {
if ((reduce_size > (9 * sizeof(kmp_real64))) &&
(reduce_size < (2000 * sizeof(kmp_real64)))) {
retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
}
}
#else
#error "Unknown or unsupported OS"
#endif
#else
#error "Unknown or unsupported architecture"
#endif
}
// A forced method overrides the heuristic, except for teams of one.
if (__kmp_force_reduction_method != reduction_method_not_defined &&
team_size != 1) {
PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
int atomic_available, tree_available;
switch ((forced_retval = __kmp_force_reduction_method)) {
case critical_reduce_block:
KMP_ASSERT(lck);
break;
case atomic_reduce_block:
atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
if (!atomic_available) {
KMP_WARNING(RedMethodNotSupported, "atomic");
forced_retval = critical_reduce_block;
}
break;
case tree_reduce_block:
tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
if (!tree_available) {
KMP_WARNING(RedMethodNotSupported, "tree");
forced_retval = critical_reduce_block;
} else {
#if KMP_FAST_REDUCTION_BARRIER
forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
}
break;
default:
KMP_ASSERT(0);
}
retval = forced_retval;
}
KA_TRACE(10, ("reduction method selected=%08x\n", retval));
#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
return (retval);
}
// Return the calling thread's stored packed reduction method with the low
// 8 bits shifted out.
kmp_int32 __kmp_get_reduce_method(void) {
  kmp_info_t *self = __kmp_entry_thread();
  return ((self->th.th_local.packed_reduction_method) >> 8);
}
// Enter the soft-paused state: only the global pause status is changed.
void __kmp_soft_pause() {
  __kmp_pause_status = kmp_soft_paused;
}
// Enter the hard-paused state: record the status first, then run
// __kmp_internal_end_thread(-1).
void __kmp_hard_pause() {
__kmp_pause_status = kmp_hard_paused;
__kmp_internal_end_thread(-1);
}
// If the runtime is soft-paused, clear the pause status and make sure no
// thread in slots 1..capacity-1 stays asleep on its fork/join b_go flag.
void __kmp_resume_if_soft_paused() {
if (__kmp_pause_status == kmp_soft_paused) {
__kmp_pause_status = kmp_not_paused;
// Slot 0 is skipped.
for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
kmp_info_t *thread = __kmp_threads[gtid];
if (thread) {
kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
thread);
// Already sleeping: wake it.
if (fl.is_sleeping())
fl.resume(gtid);
// Not sleeping and its suspend mutex could be taken: release it again and
// move on.
else if (__kmp_try_suspend_mx(thread)) {
__kmp_unlock_suspend_mx(thread);
} else {
// The suspend mutex is currently held elsewhere: retry until either the
// thread is seen sleeping (then wake it) or the mutex can be taken.
do {
if (fl.is_sleeping()) {
fl.resume(gtid);
break;
} else if (__kmp_try_suspend_mx(thread)) {
__kmp_unlock_suspend_mx(thread);
break;
}
} while (1);
}
}
}
}
}
// Change the pause state of the runtime.
//   level - kmp_not_paused requests a resume; kmp_soft_paused and
//           kmp_hard_paused request the corresponding pause
// Returns 0 when the transition was performed and 1 when it was rejected:
// resuming while not paused, pausing while already paused, or an unrecognized
// level.
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) {
    // Resume request: only valid while paused.
    if (__kmp_pause_status == kmp_not_paused)
      return 1;
    KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                     __kmp_pause_status == kmp_hard_paused);
    __kmp_pause_status = kmp_not_paused;
    return 0;
  }

  // Anything other than a soft or hard pause request is invalid.
  if (level != kmp_soft_paused && level != kmp_hard_paused)
    return 1;

  // Pause request: only valid while not already paused.
  if (__kmp_pause_status != kmp_not_paused)
    return 1;

  if (level == kmp_soft_paused) {
    __kmp_soft_pause();
  } else {
    __kmp_hard_pause();
  }
  return 0;
}
// Print the environment settings while holding __kmp_initz_lock, running
// serial initialisation first if it has not happened yet.
//   verbose - passed as the second argument of __kmp_display_env_impl(); its
//             negation is passed as the first
void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  if (!__kmp_init_serial)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
// Resize the barrier object (team->t.b) of `team` from old_nthreads to
// new_nthreads. Only valid when the fork/join release pattern is bp_dist_bar
// (asserted). The threads in slots 1..old_nthreads-1 are first moved out of
// the team through their th_used_in_team field, then the barrier is resized.
// NOTE(review): th_used_in_team values as used here -- 0 appears to mean "not
// in the team", 1 "in the team", 2 "asked to leave", 3 "joining"; confirm
// against the field's declaration.
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads) {
KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
bp_dist_bar);
kmp_info_t **other_threads = team->t.t_threads;
for (int f = 1; f < old_nthreads; ++f) {
KMP_DEBUG_ASSERT(other_threads[f] != NULL);
// Threads already at 0 need no action.
if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
continue;
}
// Spin until a thread in state 3 has left that state.
if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
KMP_CPU_PAUSE();
}
// Expected to be at 1 now; move it to 2.
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
team->t.t_threads[f]->th.th_used_in_team.store(2);
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
}
team->t.b->go_release();
KMP_MFENCE();
// Poll until every thread in slots 1..old_nthreads-1 reports 0. While a thread
// has not reached 0 and blocktime is not KMP_MAX_BLOCKTIME, a resume is sent
// on its th_sleep_loc flag in case it is sleeping.
int count = old_nthreads - 1;
while (count > 0) {
count = old_nthreads - 1;
for (int f = 1; f < old_nthreads; ++f) {
if (other_threads[f]->th.th_used_in_team.load() != 0) {
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
void *, other_threads[f]->th.th_sleep_loc);
__kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
}
} else {
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
count--;
}
}
}
// All workers are out: update the barrier's thread count and reset its go
// state.
team->t.b->update_num_threads(new_nthreads);
team->t.b->go_reset();
}
// Bring the threads in slots 1..new_nthreads-1 of `team` back into the team
// and wait until each of them reports th_used_in_team == 1.
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
KMP_DEBUG_ASSERT(team);
for (int f = 1; f < new_nthreads; ++f) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
// Flip th_used_in_team from 0 to 3; threads not currently at 0 are left
// unchanged by the compare-and-store.
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
3);
// With a finite blocktime a resume is sent in case the thread is sleeping.
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
__kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
(kmp_flag_32<false, false> *)NULL);
}
}
// Busy-wait: rescan all slots until every one of them reads 1.
int count = new_nthreads - 1;
while (count > 0) {
count = new_nthreads - 1;
for (int f = 1; f < new_nthreads; ++f) {
if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
count--;
}
}
}
}
// Globals for the hidden helper threads.
// Points into __kmp_threads at the hidden helper main thread's slot (set in
// __kmp_hidden_helper_threads_initz_routine).
kmp_info_t **__kmp_hidden_helper_threads;
// Descriptor of the hidden helper main thread.
kmp_info_t *__kmp_hidden_helper_main_thread;
// Counter of hidden helper tasks that have not been executed yet
// (presumably maintained by the tasking code; not touched in this chunk).
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
// Defaults: 8 helper threads and enabled on Linux, none and disabled
// elsewhere.
#if KMP_OS_LINUX
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif
namespace {
// Number of hidden helper threads that have entered the wrapper function.
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
// Microtask run by every hidden helper thread (passed to __kmpc_fork_call).
//   gtid - pointer to the global thread id of the executing thread
// The second pointer parameter and the variadic arguments are not used.
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
// Rendezvous: count this thread in, then spin until all
// __kmp_hidden_helper_threads_num threads have arrived.
KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
__kmp_hidden_helper_threads_num)
;
// Only the thread for which __kmpc_master() succeeds continues here.
if (__kmpc_master(nullptr, *gtid)) {
// Clear the "initialising" flag, release whoever waits on initialisation,
// then block in __kmp_hidden_helper_main_thread_wait().
TCW_4(__kmp_init_hidden_helper_threads, FALSE);
__kmp_hidden_helper_initz_release();
__kmp_hidden_helper_main_thread_wait();
// After the wait returns, signal once per worker thread.
for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
__kmp_hidden_helper_worker_thread_signal();
}
}
}
}
// Set up and run the hidden helper team from the calling thread: register a
// new root, request __kmp_hidden_helper_threads_num threads, fork the wrapper
// function, and after the fork call returns clear __kmp_init_hidden_helper and
// release the de-initialisation waiters.
void __kmp_hidden_helper_threads_initz_routine() {
const int gtid = __kmp_register_root(TRUE);
__kmp_hidden_helper_main_thread = __kmp_threads[gtid];
__kmp_hidden_helper_threads = &__kmp_threads[gtid];
// Team size for the fork below.
__kmp_hidden_helper_main_thread->th.th_set_nproc =
__kmp_hidden_helper_threads_num;
// Reset the arrival counter used by the wrapper function's rendezvous.
KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
__kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
__kmp_hidden_helper_threads_deinitz_release();
}
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/
// Allocate and zero the per-level thread-count array used by nesting mode
// (KMP_HW_LAST entries) and make sure __kmp_nested_nth.nth can hold at least
// that many levels.
void __kmp_init_nesting_mode() {
  const int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;

  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  int level = 0;
  while (level < levels) {
    __kmp_nesting_nth_level[level] = 0;
    ++level;
  }

  // Grow (never shrink) the nested-nth array.
  if (levels > __kmp_nested_nth.size) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}
// Fill in the per-level thread counts for nesting mode, from the machine
// topology when one is available and from __kmp_avail_proc otherwise, then
// publish them through __kmp_nested_nth and the calling thread's ICVs.
void __kmp_set_nesting_mode_threads() {
kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
// Upper bound on the number of levels to fill: the active-levels limit in
// mode 1, the mode value itself for modes > 1.
if (__kmp_nesting_mode == 1)
__kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
else if (__kmp_nesting_mode > 1)
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
if (__kmp_topology) {
int loc, hw_level;
// One entry per topology level, taken from get_ratio(). A level whose ratio
// is 1 is dropped: loc is stepped back so the next level overwrites it.
for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
loc < __kmp_nesting_mode_nlevels;
loc++, hw_level++) {
__kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
if (__kmp_nesting_nth_level[loc] == 1)
loc--;
}
// For modes > 1 with at least two levels: if the product of all level counts
// is below the number of cores, the last level is replaced by num_cores
// divided by the second-to-last level's count.
if (__kmp_nesting_mode > 1 && loc > 1) {
int core_level = __kmp_topology->get_level(KMP_HW_CORE);
int num_cores = __kmp_topology->get_count(core_level);
int upper_levels = 1;
for (int level = 0; level < loc - 1; ++level)
upper_levels *= __kmp_nesting_nth_level[level];
if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
__kmp_nesting_nth_level[loc - 1] =
num_cores / __kmp_nesting_nth_level[loc - 2];
}
__kmp_nesting_mode_nlevels = loc;
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
} else {
// No topology: two levels (avail/2 and 2) when at least four processors are
// available, otherwise a single level with all of them.
if (__kmp_avail_proc >= 4) {
__kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
__kmp_nesting_nth_level[1] = 2;
__kmp_nesting_mode_nlevels = 2;
} else {
__kmp_nesting_nth_level[0] = __kmp_avail_proc;
__kmp_nesting_mode_nlevels = 1;
}
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
}
// Publish the computed counts as the nested thread-count list.
for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
__kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
}
// The first level's count becomes the calling thread's nproc ICV.
set__nproc(thread, __kmp_nesting_nth_level[0]);
// For modes > 1 the level count is capped at the mode value.
if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
// A max-active-levels value above 1 on the calling thread overrides the
// computed level count.
if (get__max_active_levels(thread) > 1) {
__kmp_nesting_mode_nlevels = get__max_active_levels(thread);
}
// Only mode 1 writes the level count back as max-active-levels.
if (__kmp_nesting_mode == 1)
set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
// C-linkage fallback definitions for configurations in which the real
// implementations are compiled out.
extern "C" {
#if !KMP_STATS_ENABLED
// No-op when statistics are disabled.
void __kmp_reset_stats() {}
#endif
#if !USE_DEBUGGER
// Debugger-support flags, both FALSE when USE_DEBUGGER is off.
int __kmp_omp_debug_struct_info = FALSE;
int __kmp_debugging = FALSE;
#endif
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
// No-ops when ITT build or ITT notify support is off.
void __kmp_itt_fini_ittlib() {}
void __kmp_itt_init_ittlib() {}
#endif
}