/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */
/*
 * NOTE: __kmp_nth is a constant inside of any dispatch loop; however,
 * it may change values between parallel regions. __kmp_max_nth
 * is the largest value __kmp_nth may take, 1 is the smallest.
 */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
kmp_info_t *th;
KMP_DEBUG_ASSERT(gtid_ref);
if (__kmp_env_consistency_check) {
th = __kmp_threads[*gtid_ref];
if (th->th.th_root->r.r_active &&
(th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
}
}
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
kmp_info_t *th;
if (__kmp_env_consistency_check) {
th = __kmp_threads[*gtid_ref];
if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
__kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
}
}
}
// Decide whether a loop schedule is to be treated as monotonic or
// nonmonotonic.
//
// Precedence, highest first:
//   1. use_hier or the global __kmp_force_monotonic  -> monotonic
//   2. explicit nonmonotonic modifier on `schedule`  -> nonmonotonic
//   3. explicit monotonic modifier on `schedule`     -> monotonic
//   4. no modifier: monotonic when `loc` is non-NULL and reports an OpenMP
//      version below 50, nonmonotonic otherwise.
//
// Returns SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC.
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // The version query is done up front, as in the unconditional form of this
  // decision, so that it is evaluated regardless of the modifiers.
  const bool pre_omp50 = (loc != NULL) && (loc->get_openmp_version() < 50);
  const int unmodified_default =
      pre_omp50 ? SCHEDULE_MONOTONIC : SCHEDULE_NONMONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    return SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    return SCHEDULE_NONMONOTONIC;
  if (SCHEDULE_HAS_MONOTONIC(schedule))
    return SCHEDULE_MONOTONIC;
  return unmodified_default;
}
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Round `num` to two decimal places: scale to hundredths, add one half,
// truncate to int, and scale back. The truncation is toward zero, so the
// "+ 0.5" bias rounds to nearest only for non-negative inputs.
static inline float __kmp_round_2decimal_val(float num) {
  const int hundredths = static_cast<int>(num * 100 + 0.5);
  return static_cast<float>(hundredths) / 100;
}
// Round `num` to the nearest integer, with halves going away from zero:
// one half is subtracted from negative values and added to the rest before
// the truncating conversion to int.
static inline int __kmp_get_round_val(float num) {
  if (num < 0)
    return static_cast<int>(num - 0.5);
  return static_cast<int>(num + 0.5);
}
#endif
// Compute the static-steal partition that belongs to thread `id`.
//
// The `nchunks` chunks of the loop are split among `nproc` threads. On
// return:
//   init        - index of the first chunk of thread `id`'s share
//   small_chunk - base number of chunks per thread (per E-core thread in the
//                 hybrid case)
//   extras      - number of leftover chunks after the even split
//   p_extra     - additional chunks per thread granted to a
//                 KMP_HW_CORE_TYPE_CORE thread in the hybrid case; 0 otherwise
//
// When KMP_WEIGHTED_ITERATIONS_SUPPORTED is enabled and pr->flags.use_hybrid
// is set, the split uses the P-core chunk count (pr->u.p.pchunks), the number
// of P-core threads and the first E-core thread id stored in `pr`, and the
// core type of the thread with team-local id `id`. Otherwise it is a plain
// even split of nchunks over nproc.
template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *owner = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t owner_core =
        (kmp_hw_core_type_t)owner->th.th_topology_attrs.core_type;
    const bool on_pcore = (owner_core == KMP_HW_CORE_TYPE_CORE);

    T p_chunks = pr->u.p.pchunks;
    T e_chunks = nchunks - p_chunks;
    T p_threads = pr->u.p.num_procs_with_pcore;
    T e_threads = nproc - p_threads;
    T first_e_tid = pr->u.p.first_thread_with_ecore;

    // chunks per P-core thread and per E-core thread
    T big_chunk = p_chunks / p_threads;
    small_chunk = e_chunks / e_threads;
    extras = (p_chunks % p_threads) + (e_chunks % e_threads);
    p_extra = (big_chunk - small_chunk);

    // Number of p_extra surpluses that are laid out ahead of this thread's
    // share; it depends on the core type and on where the thread sits
    // relative to the first E-core thread.
    T surplus_before;
    if (on_pcore) {
      surplus_before = (id < first_e_tid) ? id : (id - e_threads);
    } else {
      surplus_before = (id == first_e_tid) ? id : first_e_tid;
    }
    init = id * small_chunk + surplus_before * p_extra +
           (id < extras ? id : extras);

    // Only P-core threads keep the surplus in their own share.
    if (!on_pcore)
      p_extra = 0;
    return;
  }
#endif
  // Even split: each thread gets small_chunk chunks and the first `extras`
  // threads get one more.
  small_chunk = nchunks / nproc;
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
}
#if KMP_STATIC_STEAL_ENABLED
// Values stored in a private dispatch buffer's steal_flag by the
// static-steal schedule.
enum {
// Value expected by the compare-exchange calls that claim a buffer, either
// by its owner or by a thief.
UNUSED = 0,
// Set by the owner's successful compare-exchange from UNUSED in
// __kmp_dispatch_init_algorithm, before the buffer's range is filled in.
CLAIMED = 1,
// Stored with release semantics once the buffer's count/ub are set;
// __kmp_dispatch_next_algorithm only steals from buffers in this state.
READY = 2,
// Stored by a thread on its own buffer when it runs out of chunks and starts
// looking for victims, and written by a thief into a victim buffer that was
// still UNUSED.
THIEF = 3
};
#endif
// Initialize the private dispatch buffer `pr` for one dynamically scheduled
// loop.
//
// The function
//   1. strips the monotonic/nonmonotonic modifiers and the nomerge/ordered
//      offsets from `schedule` and records them in pr->flags,
//   2. resolves generic kinds (static, runtime, guided, auto, runtime_simd)
//      to a concrete algorithm,
//   3. computes the trip count of the loop lb..ub (inclusive) with stride st,
//   4. fills in the algorithm specific fields of pr->u.p (count, lb, ub,
//      parm1..parm4), and
//   5. stores the final algorithm in pr->schedule.
//
//   loc       - source location of the loop
//   gtid      - global thread id of the caller
//   pr        - private dispatch buffer to initialize
//   schedule  - requested schedule kind, possibly with modifiers
//   lb, ub    - inclusive loop bounds
//   st        - loop stride; zero is reported through __kmp_error_construct
//               when consistency checking is enabled
//   cur_chunk - (USE_ITT_BUILD only) if non-NULL receives the chunk size used
//               for ITT metadata reporting
//   chunk     - requested chunk size; replaced by the team's runtime chunk
//               for kmp_sch_runtime and by KMP_DEFAULT_CHUNK when <= 0
//   nproc     - number of threads sharing the loop
//   tid       - index of the calling thread, in [0, nproc)
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // build the format string from the per-type printf specifiers
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the schedule kind stored in the team's runtime schedule and
      // re-derive the monotonicity from its modifiers.
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Map the generic guided/static kinds to the globally selected variants
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size stored in the team's runtime schedule
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // build the format string from the per-type printf specifiers
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // auto is mapped to the globally selected algorithm
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // build the format string from the per-type printf specifiers
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical is replaced by guided iterative (with a warning) when
       there are more than 2^20 threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // The incoming chunk is used as a multiplier of the runtime chunk below
      // and as the alignment unit in the static_balanced_chunked case.
      // NOTE(review): presumably the compiler passes the simd width here --
      // confirm against the code generator.
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // build the format string from the per-type printf specifiers
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // unit stride needs no division
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // The span is cast to unsigned so that the division is unsigned even
      // when lb - ub does not fit in the signed range of T.
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0 (or st == 0 when consistency checking is off)
    if (ub >= lb) {
      // Same unsigned-division reasoning as for the negative stride above.
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif

  /* the ordered bookkeeping is only reset for non-serialized teams */
  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    // number of chunks, rounding up
    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      // Try to claim our own buffer; this fails if the flag is no longer
      // UNUSED (see the THIEF assertion in the !claimed branch below).
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // Types wider than 4 bytes guard count/ub with a dynamically
        // allocated per-buffer lock.
        // NOTE(review): the lock is not released in this function; presumably
        // it is freed when the loop finishes -- confirm.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // On hybrid CPUs chunks are divided with a 60/40 skew between threads
      // on KMP_HW_CORE_TYPE_CORE and KMP_HW_CORE_TYPE_ATOM cores.
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          // Count the team's threads per core type and find the team-local id
          // of the thread bound to the first E-core OS id.
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              // a thread of unknown core type disables the skewed split
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          // If the P-core share was rounded up, truncate the E-core share
          // instead of rounding it; otherwise round it as well.
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);

          // Use regular static steal if there are not enough chunks for the
          // skewed distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      // Saved so that __kmp_initialize_self_buffer can redo the partition
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        // even split: the first `extras` threads get one chunk more
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // The release store publishes count/ub; __kmp_dispatch_next_algorithm
        // only steals from buffers whose flag is READY.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // the flag was already taken over by a thief
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // count == ub: no chunks to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is read back as the bound on steal attempts
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // first victim candidate: next tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
    }
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        // fewer iterations than threads: one iteration for the first tc
        // threads, nothing for the rest
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1: owns the last iteration */
        } else {
          pr->u.p.count = 1; /* nothing to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* nothing to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // ub_tmp is the calculated upper bound, "ub" the user-defined one
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // clamp to "ub" when one more stride would step past it
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // handled as static_greedy with the per-thread share rounded up to a
    // multiple of chunk; the mask below assumes chunk is a power of two
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // NOTE(review): parm2 looks like the remaining-iterations threshold
        // used by the "next" routine -- confirm against its guided case.
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        // A double is written at parm3's address; for 4-byte T it extends
        // past parm3 (presumably into parm4 -- layout not visible here).
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc;
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* single thread: same setup as kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        KMP_ASSERT(tc > 0);
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // check that parm3 is naturally aligned for a DBL
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            // square p (doubling the exponent) until it drops to the target
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// On 32-bit x86 Linux/Windows builds without KMP_I8 the value is re-read from
// parm3 instead of using the local x.
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* single thread: same setup as kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    // per-thread share: ceil(tc / nproc)
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (tc == 0)
      break;
    // clamp the chunk size to [1-or-default, tc]
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // parm2: total number of chunks, rounding up
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* trapezoid self-scheduling, minimum chunk size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
// Start hierarchical scheduling for a loop using the hierarchy description
// held in the global __kmp_hier_scheds. Only the explicit specializations
// below are defined; each forwards to __kmp_dispatch_init_hierarchy<T> with
// the size, layers and scheds arrays plus a chunk array chosen by the width
// of T.
//   loc    - source location of the loop
//   lb, ub - loop bounds
//   st     - loop stride (signed type matching T)
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
typename traits_t<T>::signed_t st);
// 32-bit signed iteration variable: uses __kmp_hier_scheds.small_chunks.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
kmp_int32 ub, kmp_int32 st) {
__kmp_dispatch_init_hierarchy<kmp_int32>(
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
// 32-bit unsigned iteration variable: uses __kmp_hier_scheds.small_chunks.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
kmp_uint32 ub, kmp_int32 st) {
__kmp_dispatch_init_hierarchy<kmp_uint32>(
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
// 64-bit signed iteration variable: uses __kmp_hier_scheds.large_chunks.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
kmp_int64 ub, kmp_int64 st) {
__kmp_dispatch_init_hierarchy<kmp_int64>(
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
// 64-bit unsigned iteration variable: uses __kmp_hier_scheds.large_chunks.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
kmp_uint64 ub, kmp_int64 st) {
__kmp_dispatch_init_hierarchy<kmp_uint64>(
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
// Release the scheduling hierarchy attached to each of the team's shared
// dispatch buffers.
//
// The number of buffers visited is __kmp_dispatch_num_buffers when the team
// can have more than one thread (t_max_nproc > 1) and 2 otherwise. For every
// buffer with a non-NULL `hier`, the hierarchy is deallocated and its storage
// is freed with __kmp_free(). The buffers are viewed through the kmp_int32
// instantiation of the shared-info template.
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  typedef dispatch_shared_info_template<kmp_int32> volatile shared_buf_t;

  int buffer_count = 2;
  if (team->t.t_max_nproc > 1)
    buffer_count = __kmp_dispatch_num_buffers;

  for (int idx = 0; idx < buffer_count; ++idx) {
    shared_buf_t *shared =
        reinterpret_cast<shared_buf_t *>(&team->t.t_disp_buffer[idx]);
    if (!shared->hier)
      continue;
    shared->hier->deallocate();
    __kmp_free(shared->hier);
  }
}
#endif
// Entry point that prepares the calling thread for a dynamically scheduled
// loop: it selects the private (and, for non-serialized teams, shared)
// dispatch buffer, lets __kmp_dispatch_init_algorithm fill in the private
// buffer, installs the ordered enter/exit callbacks, and issues the optional
// ITT, debug-trace and OMPT notifications.
//   loc      - source location of the loop; also stored in th_ident
//   gtid     - global thread id of the caller (validated)
//   schedule - requested schedule kind, possibly with modifiers
//   lb, ub   - loop bounds
//   st       - loop stride
//   chunk    - requested chunk size
//   push_ws  - not referenced in this function body
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
T ub, typename traits_t<T>::signed_t st,
typename traits_t<T>::signed_t chunk, int push_ws) {
typedef typename traits_t<T>::unsigned_t UT;
int active;
kmp_info_t *th;
kmp_team_t *team;
kmp_uint32 my_buffer_index;
dispatch_private_info_template<T> *pr;
// sh is only assigned and used when the team is not serialized (active).
dispatch_shared_info_template<T> volatile *sh;
// The typed templates are cast over the generic buffer structs below, so
// their sizes must match.
KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
sizeof(dispatch_private_info));
KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
sizeof(dispatch_shared_info));
__kmp_assert_valid_gtid(gtid);
// Make sure the parallel part of the runtime is initialized.
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
__kmp_resume_if_soft_paused();
#if INCLUDE_SSC_MARKS
SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
typedef typename traits_t<T>::signed_t ST;
{
char *buff;
// build the format string from the per-type printf specifiers
buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
"chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
traits_t<ST>::spec, traits_t<T>::spec,
traits_t<T>::spec, traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
__kmp_str_free(&buff);
}
#endif
/* setup data */
th = __kmp_threads[gtid];
team = th->th.th_team;
active = !team->t.t_serialized;
th->th.th_ident = loc;
// Statistics only: count the loop as static or dynamic.
if (schedule == __kmp_static) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
} else {
KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
}
#if KMP_USE_HIER_SCHED
// Hierarchical scheduling is turned off for ordered loops. The buffer is
// looked up here without advancing th_disp_index; the index is advanced
// further below.
bool ordered;
enum sched_type my_sched = schedule;
my_buffer_index = th->th.th_dispatch->th_disp_index;
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
&th->th.th_dispatch
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
// Strip modifiers and the nomerge offset so the ordered bit can be tested.
my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
my_sched =
(enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
ordered = (kmp_ord_lower & my_sched);
if (pr->flags.use_hier) {
if (ordered) {
KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
"Disabling hierarchical scheduling.\n",
gtid));
pr->flags.use_hier = FALSE;
}
}
// For schedule(runtime) with a configured hierarchy, initialize it unless
// the loop is ordered or the buffer already has use_hier set.
if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
if (!ordered && !pr->flags.use_hier)
__kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
}
#endif
#if USE_ITT_BUILD
kmp_uint64 cur_chunk = chunk;
// Metadata is reported only when all of these conditions hold.
int itt_need_metadata_reporting =
__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
team->t.t_active_level == 1;
#endif
if (!active) {
// Serialized team: use the first private buffer; no shared buffer.
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
th->th.th_dispatch->th_disp_buffer);
} else {
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
// Take the current buffer index and advance it for the next loop.
my_buffer_index = th->th.th_dispatch->th_disp_index++;
// Private and shared buffers are selected with the same index modulo
// __kmp_dispatch_num_buffers.
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
&th->th.th_dispatch
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
my_buffer_index));
// NOTE(review): presumably __kmp_wait with __kmp_eq blocks until the shared
// buffer's index equals ours, i.e. until the buffer is free for this loop
// -- confirm against __kmp_wait.
if (sh->buffer_index != my_buffer_index) {
KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
" sh->buffer_index:%d\n",
gtid, my_buffer_index, sh->buffer_index));
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
"sh->buffer_index:%d\n",
gtid, my_buffer_index, sh->buffer_index));
}
}
// Fill in the private buffer; nproc and tid are taken from the thread.
__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
&cur_chunk,
#endif
chunk, (T)th->th.th_team_nproc,
(T)th->th.th_info.ds.ds_tid);
if (active) {
// Non-ordered loops get the consistency-check-only hooks; ordered loops
// get the real ordered enter/exit functions.
if (pr->flags.ordered == 0) {
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
} else {
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
}
// Publish the buffers as the thread's current dispatch state.
th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
th->th.th_dispatch->th_dispatch_sh_current =
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
if (pr->flags.ordered) {
__kmp_itt_ordered_init(gtid);
}
// Report loop metadata
if (itt_need_metadata_reporting) {
// schedtype: 0 for static kinds, 1 for dynamic, 2 for guided, 3 otherwise.
// Note that the switch is on the requested `schedule`, not pr->schedule.
kmp_uint64 schedtype = 0;
switch (schedule) {
case kmp_sch_static_chunked:
case kmp_sch_static_balanced:
break;
case kmp_sch_static_greedy:
cur_chunk = pr->u.p.parm1;
break;
case kmp_sch_dynamic_chunked:
schedtype = 1;
break;
case kmp_sch_guided_iterative_chunked:
case kmp_sch_guided_analytical_chunked:
case kmp_sch_guided_simd:
schedtype = 2;
break;
default:
schedtype = 3;
break;
}
__kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
}
#if KMP_USE_HIER_SCHED
// With hierarchical scheduling the thread-level bounds are cleared.
// NOTE(review): this block sits inside the USE_ITT_BUILD region, so it is
// compiled out when ITT support is disabled -- confirm that is intended.
if (pr->flags.use_hier) {
pr->u.p.count = 0;
pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
}
#endif
#endif
}
#ifdef KMP_DEBUG
{
char *buff;
// build the format string from the per-type printf specifiers
buff = __kmp_str_format(
"__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
"lb:%%%s ub:%%%s"
" st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
" parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
__kmp_str_free(&buff);
}
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
// Notify an OMPT tool that the worksharing loop begins, passing the trip
// count.
if (ompt_enabled.ompt_callback_work) {
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
ompt_callbacks.ompt_callback(ompt_callback_work)(
ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
&(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
typedef typename traits_t<UT>::signed_t ST;
__kmp_assert_valid_gtid(gtid);
kmp_info_t *th = __kmp_threads[gtid];
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
if (!th->th.th_team->t.t_serialized) {
dispatch_private_info_template<UT> *pr =
reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
dispatch_shared_info_template<UT> volatile *sh =
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
th->th.th_dispatch->th_dispatch_sh_current);
KMP_DEBUG_ASSERT(pr);
KMP_DEBUG_ASSERT(sh);
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
if (pr->ordered_bumped) {
KD_TRACE(
1000,
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
gtid));
pr->ordered_bumped = 0;
} else {
UT lower = pr->u.p.ordered_lower;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
"ordered_iteration:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
}
#endif
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB();
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
"ordered_iteration:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
}
#endif
test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
}
}
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
typedef typename traits_t<UT>::signed_t ST;
__kmp_assert_valid_gtid(gtid);
kmp_info_t *th = __kmp_threads[gtid];
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
if (!th->th.th_team->t.t_serialized) {
dispatch_private_info_template<UT> *pr =
reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
dispatch_shared_info_template<UT> volatile *sh =
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
th->th.th_dispatch->th_dispatch_sh_current);
KMP_DEBUG_ASSERT(pr);
KMP_DEBUG_ASSERT(sh);
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
UT lower = pr->u.p.ordered_lower;
UT upper = pr->u.p.ordered_upper;
UT inc = upper - lower + 1;
if (pr->ordered_bumped == inc) {
KD_TRACE(
1000,
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
gtid));
pr->ordered_bumped = 0;
} else {
inc -= pr->ordered_bumped;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_finish_chunk: T#%%d before wait: "
"ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
__kmp_str_free(&buff);
}
#endif
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB();
KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
"ordered_bumped to zero\n",
gtid));
pr->ordered_bumped = 0;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_finish_chunk: T#%%d after wait: "
"ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
traits_t<UT>::spec);
KD_TRACE(1000,
(buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
__kmp_str_free(&buff);
}
#endif
test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
}
}
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
#endif
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
dispatch_private_info_template<T> *pr,
dispatch_shared_info_template<T> volatile *sh,
kmp_int32 *p_last, T *p_lb, T *p_ub,
typename traits_t<T>::signed_t *p_st, T nproc,
T tid) {
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::floating_t DBL;
int status = 0;
bool last = false;
T start;
ST incr;
UT limit, trip, init;
kmp_info_t *th = __kmp_threads[gtid];
kmp_team_t *team = th->th.th_team;
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
KMP_DEBUG_ASSERT(pr);
KMP_DEBUG_ASSERT(sh);
KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
{
char *buff;
buff =
__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
"sh:%%p nproc:%%%s tid:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
__kmp_str_free(&buff);
}
#endif
if (pr->u.p.tc == 0) {
KD_TRACE(10,
("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
"zero status:%d\n",
gtid, status));
return 0;
}
switch (pr->schedule) {
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
T chunk = pr->u.p.parm1;
UT nchunks = pr->u.p.parm2;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
gtid));
trip = pr->u.p.tc - 1;
if (traits_t<T>::type_size > 4) {
kmp_lock_t *lck = pr->u.p.steal_lock;
KMP_DEBUG_ASSERT(lck != NULL);
if (pr->u.p.count < (UT)pr->u.p.ub) {
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
__kmp_acquire_lock(lck, gtid);
init = (pr->u.p.count)++;
status = (init < (UT)pr->u.p.ub);
__kmp_release_lock(lck, gtid);
} else {
status = 0;
}
if (!status) {
kmp_lock_t *lckv;
T while_limit = pr->u.p.parm3;
T while_index = 0;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers;
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF);
while ((!status) && (while_limit != ++while_index)) {
dispatch_private_info_template<T> *v;
T remaining;
T victimId = pr->u.p.parm4;
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
oldVictimId != victimId) {
victimId = (victimId + 1) % nproc;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
}
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue;
}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;
status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {
T id = victimId;
T small_chunk = 0, extras = 0, p_extra = 0;
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
init, small_chunk, extras,
p_extra);
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1;
pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
#endif
if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
}
}
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
v->u.p.count >= (UT)v->u.p.ub) {
pr->u.p.parm4 = (victimId + 1) % nproc;
continue;
}
lckv = v->u.p.steal_lock;
KMP_ASSERT(lckv != NULL);
__kmp_acquire_lock(lckv, gtid);
limit = v->u.p.ub;
if (v->u.p.count >= limit) {
__kmp_release_lock(lckv, gtid);
pr->u.p.parm4 = (victimId + 1) % nproc;
continue;
}
remaining = limit - v->u.p.count;
if (remaining > 7) {
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
init = (v->u.p.ub -= (remaining >> 2));
} else {
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
init = (v->u.p.ub -= 1);
}
__kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(10, (buff, gtid, victimId, init, limit));
__kmp_str_free(&buff);
}
#endif
KMP_DEBUG_ASSERT(init + 1 <= limit);
pr->u.p.parm4 = victimId;
status = 1;
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1;
pr->u.p.ub = limit;
__kmp_release_lock(lck, gtid);
if (init + 1 < limit)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
}
}
} else {
typedef union {
struct {
UT count;
T ub;
} p;
kmp_int64 b;
} union_i4;
union_i4 vold, vnew;
if (pr->u.p.count < (UT)pr->u.p.ub) {
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
vnew.b = vold.b;
vnew.p.count++;
while (!KMP_COMPARE_AND_STORE_REL64(
(volatile kmp_int64 *)&pr->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
KMP_CPU_PAUSE();
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
vnew.b = vold.b;
vnew.p.count++;
}
init = vold.p.count;
status = (init < (UT)vold.p.ub);
} else {
status = 0;
}
if (!status) {
T while_limit = pr->u.p.parm3;
T while_index = 0;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers;
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF);
while ((!status) && (while_limit != ++while_index)) {
dispatch_private_info_template<T> *v;
T remaining;
T victimId = pr->u.p.parm4;
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
oldVictimId != victimId) {
victimId = (victimId + 1) % nproc;
v = reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
KMP_DEBUG_ASSERT(v);
}
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
continue;
}
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
kmp_uint32 old = UNUSED;
status = v->steal_flag.compare_exchange_strong(old, THIEF);
if (status) {
T id = victimId;
T small_chunk = 0, extras = 0, p_extra = 0;
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
init, small_chunk, extras,
p_extra);
vnew.p.count = init + 1;
vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif
pr->u.p.parm4 = (id + 1) % nproc;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
#endif
if (pr->u.p.count < (UT)pr->u.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
}
}
while (1) {
vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
vold.p.count >= (UT)vold.p.ub) {
pr->u.p.parm4 = (victimId + 1) % nproc;
break;
}
vnew.b = vold.b;
remaining = vold.p.ub - vold.p.count;
if (remaining > 7) {
vnew.p.ub -= remaining >> 2;
} else {
vnew.p.ub -= 1;
}
KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
if (KMP_COMPARE_AND_STORE_REL64(
(volatile kmp_int64 *)&v->u.p.count,
*VOLATILE_CAST(kmp_int64 *) & vold.b,
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
"count:%%%s ub:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
__kmp_str_free(&buff);
}
#endif
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
vold.p.ub - vnew.p.ub);
status = 1;
pr->u.p.parm4 = victimId;
init = vnew.p.ub;
vold.p.count = init + 1;
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
if (vold.p.count < (UT)vold.p.ub)
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
break;
}
KMP_CPU_PAUSE();
}
}
}
}
if (!status) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
start = pr->u.p.lb;
init *= chunk;
limit = chunk + init - 1;
incr = pr->u.p.st;
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
KMP_DEBUG_ASSERT(init <= trip);
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
}
break;
}
#endif
case kmp_sch_static_balanced: {
KD_TRACE(
10,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
gtid));
if ((status = !pr->u.p.count) != 0) {
pr->u.p.count = 1;
*p_lb = pr->u.p.lb;
*p_ub = pr->u.p.ub;
last = (pr->u.p.parm1 != 0);
if (p_st != NULL)
*p_st = pr->u.p.st;
} else {
pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
}
}
break;
case kmp_sch_static_greedy:
merged here */
case kmp_sch_static_chunked: {
T parm1;
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
"kmp_sch_static_[affinity|chunked] case\n",
gtid));
parm1 = pr->u.p.parm1;
trip = pr->u.p.tc - 1;
init = parm1 * (pr->u.p.count + tid);
if ((status = (init <= trip)) != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
limit = parm1 + init - 1;
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
pr->u.p.count += nproc;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
}
}
break;
case kmp_sch_dynamic_chunked: {
UT chunk_number;
UT chunk_size = pr->u.p.parm1;
UT nchunks = pr->u.p.parm2;
KD_TRACE(
100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
gtid));
chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
status = (chunk_number < nchunks);
if (!status) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
init = chunk_size * chunk_number;
trip = pr->u.p.tc - 1;
start = pr->u.p.lb;
incr = pr->u.p.st;
if ((last = (trip - init < (UT)chunk_size)))
limit = trip;
else
limit = chunk_size + init - 1;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
}
}
break;
case kmp_sch_guided_iterative_chunked: {
T chunkspec = pr->u.p.parm1;
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
"iterative case\n",
gtid));
trip = pr->u.p.tc;
while (1) {
ST remaining;
init = sh->u.s.iteration;
remaining = trip - init;
if (remaining <= 0) {
status = 0;
break;
}
if ((T)remaining <
pr->u.p.parm2) {
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)chunkspec);
remaining = trip - init;
if (remaining <= 0) {
status = 0;
} else {
status = 1;
if ((T)remaining > chunkspec) {
limit = init + chunkspec - 1;
} else {
last = true;
limit = init + remaining - 1;
}
}
break;
}
limit = init + (UT)((double)remaining *
*(double *)&pr->u.p.parm3);
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)init, (ST)limit)) {
status = 1;
--limit;
break;
}
}
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
}
}
break;
case kmp_sch_guided_simd: {
T chunk = pr->u.p.parm1;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
gtid));
trip = pr->u.p.tc;
while (1) {
ST remaining;
init = sh->u.s.iteration;
remaining = trip - init;
if (remaining <= 0) {
status = 0;
break;
}
KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
if ((T)remaining < pr->u.p.parm2) {
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)chunk);
remaining = trip - init;
if (remaining <= 0) {
status = 0;
} else {
status = 1;
if ((T)remaining > chunk) {
limit = init + chunk - 1;
} else {
last = true;
limit = init + remaining - 1;
}
}
break;
}
UT span;
__kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
&span);
UT rem = span % chunk;
if (rem)
span += chunk - rem;
limit = init + span;
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
(ST)init, (ST)limit)) {
status = 1;
--limit;
break;
}
}
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
}
}
break;
case kmp_sch_guided_analytical_chunked: {
T chunkspec = pr->u.p.parm1;
UT chunkIdx;
#if KMP_USE_X87CONTROL
IA-32 architecture 8-byte version */
unsigned int oldFpcw;
unsigned int fpcwSet = 0;
#endif
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
"kmp_sch_guided_analytical_chunked case\n",
gtid));
trip = pr->u.p.tc;
KMP_DEBUG_ASSERT(nproc > 1);
KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
while (1) {
chunk sizes */
chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
if (chunkIdx >= (UT)pr->u.p.parm2) {
--trip;
init = chunkIdx * chunkspec + pr->u.p.count;
* calculation */
if ((status = (init > 0 && init <= trip)) != 0) {
limit = init + chunkspec - 1;
if ((last = (limit >= trip)) != 0)
limit = trip;
}
break;
} else {
Windows* OS.
This check works around the possible effect that init != 0 for chunkIdx == 0.
*/
#if KMP_USE_X87CONTROL
FPCW and set precision to 64-bit, as Windows* OS
on IA-32 architecture defaults to 53-bit */
if (!fpcwSet) {
oldFpcw = _control87(0, 0);
_control87(_PC_64, _MCW_PC);
fpcwSet = 0x30000;
}
#endif
if (chunkIdx) {
init = __kmp_dispatch_guided_remaining<T>(
trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
KMP_DEBUG_ASSERT(init);
init = trip - init;
} else
init = 0;
limit = trip - __kmp_dispatch_guided_remaining<T>(
trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
KMP_ASSERT(init <= limit);
if (init < limit) {
KMP_DEBUG_ASSERT(limit <= trip);
--limit;
status = 1;
break;
}
}
}
#if KMP_USE_X87CONTROL
AC: check fpcwSet flag first because oldFpcw can be uninitialized here
*/
if (fpcwSet && (oldFpcw & fpcwSet))
_control87(oldFpcw, _MCW_PC);
#endif
if (status != 0) {
start = pr->u.p.lb;
incr = pr->u.p.st;
if (p_st != NULL)
*p_st = incr;
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
} else {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
}
}
break;
case kmp_sch_trapezoidal: {
UT index;
T parm2 = pr->u.p.parm2;
T parm3 = pr->u.p.parm3;
T parm4 = pr->u.p.parm4;
KD_TRACE(100,
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
gtid));
index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
trip = pr->u.p.tc - 1;
if ((status = ((T)index < parm3 && init <= trip)) == 0) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
} else {
start = pr->u.p.lb;
limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
incr = pr->u.p.st;
if ((last = (limit >= trip)) != 0)
limit = trip;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
}
}
break;
default: {
status = 0;
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
KMP_HNT(GetNewerLibrary),
__kmp_msg_null
);
} break;
}
if (p_last)
*p_last = last;
#ifdef KMP_DEBUG
if (pr->flags.ordered) {
char *buff;
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
"ordered_lower:%%%s ordered_upper:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
__kmp_str_free(&buff);
}
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
KMP_DEBUG_ASSERT(p_last);
KMP_DEBUG_ASSERT(p_st);
KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
__kmp_str_free(&buff);
}
#endif
return status;
}
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
// OMPT_LOOP_END: when `status` is 0 and a work callback is registered, invokes
// that callback with ompt_scope_end for the schedule kind of `pr`. The macro
// takes no arguments: it relies on `status`, `pr` and `codeptr` being visible
// at the expansion site.
#define OMPT_LOOP_END \
if (status == 0) { \
if (ompt_enabled.ompt_callback_work) { \
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
ompt_callbacks.ompt_callback(ompt_callback_work)( \
ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
&(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
} \
}
// OMPT_LOOP_DISPATCH: when a dispatch callback is registered and `status` is
// non-zero, builds an ompt_dispatch_chunk_t from (lb, ub, st) and passes a
// pointer to it to the callback as a ompt_dispatch_ws_loop_chunk instance.
// The chunk object is a local of the expanded block.
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
if (ompt_enabled.ompt_callback_dispatch && status) { \
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
ompt_dispatch_chunk_t chunk; \
ompt_data_t instance = ompt_data_none; \
OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
instance.ptr = &chunk; \
ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
&(team_info->parallel_data), &(task_info->task_data), \
ompt_dispatch_ws_loop_chunk, instance); \
}
#else
// Without optional OMPT support both macros expand to nothing.
#define OMPT_LOOP_END
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)
#endif
#if KMP_STATS_ENABLED
// KMP_STATS_LOOP_END: records in OMP_loop_dynamic_iterations the number of
// iterations `t` covered by the chunk [*p_lb, *p_ub] with stride pr->u.p.st.
// When `status` is 0 the count is 0 and the partitioned timer is popped.
// An empty range (bounds on the wrong side of each other for the sign of the
// stride) also counts as 0. The macro takes no arguments: it relies on
// `status`, `pr`, `p_lb` and `p_ub` being visible at the expansion site.
#define KMP_STATS_LOOP_END \
{ \
kmp_int64 u, l, t, i; \
l = (kmp_int64)(*p_lb); \
u = (kmp_int64)(*p_ub); \
i = (kmp_int64)(pr->u.p.st); \
if (status == 0) { \
t = 0; \
KMP_POP_PARTITIONED_TIMER(); \
} else if (i == 1) { \
if (u >= l) \
t = u - l + 1; \
else \
t = 0; \
} else if (i < 0) { \
if (l >= u) \
t = (l - u) / (-i) + 1; \
else \
t = 0; \
} else { \
if (u >= l) \
t = (u - l) / i + 1; \
else \
t = 0; \
} \
KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
}
#else
// Statistics disabled: the macro expands to nothing.
#define KMP_STATS_LOOP_END
#endif
// Hands the calling thread its next chunk of the current dispatched loop.
//
//   loc      source location, passed on to the consistency-check and
//            hierarchical-scheduling helpers
//   gtid     global id of the calling thread (validated below)
//   p_last   if non-NULL, receives the "last chunk" flag when a chunk is
//            returned
//   p_lb, p_ub, p_st
//            receive the bounds and stride of the chunk; must not be NULL
//            (asserted in debug builds)
//   codeptr  only present with optional OMPT support; forwarded to the OMPT
//            work callback by OMPT_LOOP_END
//
// Returns non-zero if a chunk was assigned and 0 when no iterations are left.
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
T *p_lb, T *p_ub,
typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
,
void *codeptr
#endif
) {
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
int status;
dispatch_private_info_template<T> *pr;
__kmp_assert_valid_gtid(gtid);
kmp_info_t *th = __kmp_threads[gtid];
kmp_team_t *team = th->th.th_team;
KMP_DEBUG_ASSERT(p_lb && p_ub && p_st);
KD_TRACE(
1000,
("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
gtid, p_lb, p_ub, p_st, p_last));
// Serialized team: no shared buffer is consulted, everything is computed from
// the thread's own dispatch buffer.
if (team->t.t_serialized) {
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
th->th.th_dispatch->th_disp_buffer);
KMP_DEBUG_ASSERT(pr);
// Trip count of zero: nothing (left) to execute.
if ((status = (pr->u.p.tc != 0)) == 0) {
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
if (__kmp_env_consistency_check) {
if (pr->pushed_ws != ct_none) {
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
}
}
} else if (pr->flags.nomerge) {
// nomerge: the range is still handed out chunk by chunk, with the chunk
// size taken from parm1 and the chunk index kept in pr->u.p.count.
kmp_int32 last;
T start;
UT limit, trip, init;
ST incr;
T chunk = pr->u.p.parm1;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
gtid));
init = chunk * pr->u.p.count++;
trip = pr->u.p.tc - 1;
if ((status = (init <= trip)) == 0) {
// All chunks have been handed out already.
*p_lb = 0;
*p_ub = 0;
if (p_st != NULL)
*p_st = 0;
if (__kmp_env_consistency_check) {
if (pr->pushed_ws != ct_none) {
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
}
}
} else {
start = pr->u.p.lb;
limit = chunk + init - 1;
incr = pr->u.p.st;
// Clamp the final chunk to the end of the iteration space.
if ((last = (limit >= trip)) != 0) {
limit = trip;
#if KMP_OS_WINDOWS
pr->u.p.last_upper = pr->u.p.ub;
#endif
}
if (p_last != NULL)
*p_last = last;
if (p_st != NULL)
*p_st = incr;
if (incr == 1) {
*p_lb = start + init;
*p_ub = start + limit;
} else {
*p_lb = start + init * incr;
*p_ub = start + limit * incr;
}
if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
"ordered_lower:%%%s ordered_upper:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
pr->u.p.ordered_upper));
__kmp_str_free(&buff);
}
#endif
}
}
} else {
// Default serialized case: return the whole stored range at once and zero
// the trip count so that the next call reports "no more work".
pr->u.p.tc = 0;
*p_lb = pr->u.p.lb;
*p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
pr->u.p.last_upper = *p_ub;
#endif
if (p_last != NULL)
*p_last = TRUE;
if (p_st != NULL)
*p_st = pr->u.p.st;
}
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
"p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
(p_last ? *p_last : 0), status));
__kmp_str_free(&buff);
}
#endif
#if INCLUDE_SSC_MARKS
SSC_MARK_DISPATCH_NEXT();
#endif
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
OMPT_LOOP_END;
KMP_STATS_LOOP_END;
return status;
} else {
// Non-serialized team: use the private and shared buffers that are current
// for this thread's dispatcher.
kmp_int32 last = 0;
dispatch_shared_info_template<T> volatile *sh;
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
th->th.th_dispatch->th_dispatch_pr_current);
KMP_DEBUG_ASSERT(pr);
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
th->th.th_dispatch->th_dispatch_sh_current);
KMP_DEBUG_ASSERT(sh);
// Pick the hierarchical scheduler when it is compiled in and enabled for
// this loop; otherwise fall through to the flat algorithm.
#if KMP_USE_HIER_SCHED
if (pr->flags.use_hier)
status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
else
#endif
status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
p_st, th->th.th_team_nproc,
th->th.th_info.ds.ds_tid);
// status == 0: this thread has no more iterations to execute.
if (status == 0) {
ST num_done;
// Count this thread as finished; num_done holds the value before the
// increment.
num_done = test_then_inc<ST>(&sh->u.s.num_done);
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
__kmp_str_free(&buff);
}
#endif
#if KMP_USE_HIER_SCHED
pr->flags.use_hier = FALSE;
#endif
// The last thread of the team to finish releases the shared buffer.
if (num_done == th->th.th_team_nproc - 1) {
#if KMP_STATIC_STEAL_ENABLED
if (pr->schedule == kmp_sch_static_steal) {
int i;
int idx = (th->th.th_dispatch->th_disp_index - 1) %
__kmp_dispatch_num_buffers;
// Reset every thread's private buffer for this loop: each must be in
// the THIEF state by now and is returned to UNUSED.
for (i = 0; i < th->th.th_team_nproc; ++i) {
dispatch_private_info_template<T> *buf =
reinterpret_cast<dispatch_private_info_template<T> *>(
&team->t.t_dispatch[i].th_disp_buffer[idx]);
KMP_ASSERT(buf->steal_flag == THIEF);
KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
// Types wider than 4 bytes use a per-buffer steal lock; destroy and
// free it here.
if (traits_t<T>::type_size > 4) {
kmp_lock_t *lck = buf->u.p.steal_lock;
KMP_ASSERT(lck != NULL);
__kmp_destroy_lock(lck);
__kmp_free(lck);
buf->u.p.steal_lock = NULL;
}
}
}
#endif
// Reset the shared counters, then advance buffer_index by the number of
// dispatch buffers. Memory barriers separate the steps.
KMP_MB();
sh->u.s.num_done = 0;
sh->u.s.iteration = 0;
if (pr->flags.ordered) {
sh->u.s.ordered_iteration = 0;
}
KMP_MB();
sh->buffer_index += __kmp_dispatch_num_buffers;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
gtid, sh->buffer_index));
KMP_MB();
}
if (__kmp_env_consistency_check) {
if (pr->pushed_ws != ct_none) {
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
}
}
// Detach this thread's dispatcher from the finished loop.
th->th.th_dispatch->th_deo_fcn = NULL;
th->th.th_dispatch->th_dxo_fcn = NULL;
th->th.th_dispatch->th_dispatch_sh_current = NULL;
th->th.th_dispatch->th_dispatch_pr_current = NULL;
}
#if KMP_OS_WINDOWS
else if (last) {
pr->u.p.last_upper = pr->u.p.ub;
}
#endif
// *p_last is only written when a chunk was actually returned.
if (p_last != NULL && status != 0)
*p_last = last;
}
#ifdef KMP_DEBUG
{
char *buff;
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d normal case: "
"p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
(p_last ? *p_last : 0), status));
__kmp_str_free(&buff);
}
#endif
#if INCLUDE_SSC_MARKS
SSC_MARK_DISPATCH_NEXT();
#endif
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
OMPT_LOOP_END;
KMP_STATS_LOOP_END;
return status;
}
/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@return Zero if the parallel region is not active and this thread should execute
all sections, non-zero otherwise.

Beginning of the sections construct.
There are no implicit barriers in the "sections" calls; rather, the compiler
should introduce an explicit barrier if one is required.

This implementation is based on __kmp_dispatch_init and uses the same
constructs for shared data (sections cannot be nested directly in an omp for
loop; there must be a parallel region in between).
*/
// Starts a sections construct for the calling thread. When the team is not
// serialized, claims the next shared dispatch buffer, waits for it to become
// available and installs it as the thread's current shared buffer. Returns
// non-zero when the team is active (not serialized), zero otherwise.
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Make sure the parallel part of the runtime is initialized and resumed
  // before any thread state is touched.
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *tm = thr->th.th_team;
  int is_active = !tm->t.t_serialized;
  thr->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));

  if (is_active) {
    KMP_DEBUG_ASSERT(thr->th.th_dispatch ==
                     &thr->th.th_team->t.t_dispatch[thr->th.th_info.ds.ds_tid]);

    // Take the next buffer index for this thread; the shared buffer is the
    // same structure used by dynamically scheduled loops.
    kmp_uint32 buf_idx = thr->th.th_dispatch->th_disp_index++;
    dispatch_shared_info_template<kmp_int32> volatile *shared =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &tm->t.t_disp_buffer[buf_idx % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  buf_idx));

    thr->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    thr->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, buf_idx, shared->buffer_index));
    // Block until the shared buffer's index equals the index claimed above.
    __kmp_wait<kmp_uint32>(&shared->buffer_index, buf_idx,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, buf_idx, shared->buffer_index));

    // Sections use only the shared buffer; no private buffer is installed.
    thr->th.th_dispatch->th_dispatch_pr_current = nullptr;
    thr->th.th_dispatch->th_dispatch_sh_current = CCAST(
        dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)shared);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *tm_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *tk_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(tm_info->parallel_data),
        &(tk_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return is_active;
}
/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@param numberOfSections  number of sections in the 'sections' construct
@return an index in [0, numberOfSections) identifying the section to execute
next on this thread; any value outside that range means there is nothing left
to execute on this thread.
*/
// Atomically claims the next section index from the shared dispatch buffer
// and returns it. An index >= numberOfSections means nothing is left for this
// thread; in that case the thread is counted as done, the last thread to
// finish releases the shared buffer, and the thread's dispatcher is detached.
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 numberOfSections) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);

  kmp_info_t *thr = __kmp_threads[gtid];

  KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
                  numberOfSections));

  // The team is expected not to be serialized when this function is called.
  KMP_DEBUG_ASSERT(!thr->th.th_team->t.t_serialized);
  KMP_DEBUG_ASSERT(thr->th.th_dispatch ==
                   &thr->th.th_team->t.t_dispatch[thr->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(!(thr->th.th_dispatch->th_dispatch_pr_current));

  dispatch_shared_info_template<kmp_int32> volatile *shared =
      reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
          thr->th.th_dispatch->th_dispatch_sh_current);
  KMP_DEBUG_ASSERT(shared);

  // Claim a section index; the counter's previous value is the index.
  kmp_int32 claimed =
      test_then_inc<kmp_int32>((kmp_int32 *)&shared->u.s.iteration);
  if (claimed < numberOfSections)
    return claimed;

  // No section left for this thread: register it as finished.
  kmp_int32 finished_before =
      test_then_inc<kmp_int32>((kmp_int32 *)(&shared->u.s.num_done));

  if (finished_before == thr->th.th_team_nproc - 1) {
    // Last thread of the team: reset the counters and advance the buffer
    // index, with memory barriers between the steps.
    KMP_MB();

    shared->u.s.num_done = 0;
    shared->u.s.iteration = 0;

    KMP_MB();

    shared->buffer_index += __kmp_dispatch_num_buffers;
    KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
                   shared->buffer_index));

    KMP_MB();
  }

  thr->th.th_dispatch->th_deo_fcn = NULL;
  thr->th.th_dispatch->th_dxo_fcn = NULL;
  thr->th.th_dispatch->th_dispatch_sh_current = NULL;
  thr->th.th_dispatch->th_dispatch_pr_current = NULL;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_dispatch) {
    ompt_team_info_t *tm_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *tk_info = __ompt_get_task_info_object(0);
    ompt_data_t inst = ompt_data_none;
    inst.ptr = OMPT_GET_RETURN_ADDRESS(0);
    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
        &(tm_info->parallel_data), &(tk_info->task_data),
        ompt_dispatch_section, inst);
  }
#endif

  return claimed;
}
/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number

End of the "sections" construct.
There is no need to wait here: a barrier is added separately when needed.
*/
// Ends a sections construct for the calling thread: pops the partitioned
// timer and, for a serialized team only, reports ompt_scope_end for the
// sections region to a registered OMPT work callback.
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
  kmp_info_t *thr = __kmp_threads[gtid];
  const bool serialized = thr->th.th_team->t.t_serialized != 0;

  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));

  if (serialized) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_work) {
      ompt_team_info_t *tm_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *tk_info = __ompt_get_task_info_object(0);
      ompt_callbacks.ompt_callback(ompt_callback_work)(
          ompt_work_sections, ompt_scope_end, &(tm_info->parallel_data),
          &(tk_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
    }
#endif
  }

  KMP_POP_PARTITIONED_TIMER();
  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
// Narrows the iteration range [*plower, *pupper] with stride incr to the part
// that belongs to the calling thread's team, writing the per-team bounds back
// through plower/pupper and the "owns the last iteration" flag through
// plastiter. The split follows the __kmp_static setting (balanced or greedy).
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    // create format specifiers before the debug output
    char *fmt = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                                 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                                 traits_t<T>::spec, traits_t<T>::spec,
                                 traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (fmt, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&fmt);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    // The bounds must be ordered consistently with the sign of the stride.
    const bool bounds_reversed =
        incr > 0 ? (*pupper < *plower) : (*plower < *pupper);
    if (bounds_reversed) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }

  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *tm = thr->th.th_team;
  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
  kmp_uint32 nteams = thr->th.th_teams_size.nteams;
  kmp_uint32 team_id = tm->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)tm->t.t_parent->t.t_nproc);

  // Global trip count; the difference is taken in the unsigned type for the
  // general strides.
  UT total_iters;
  if (incr == 1) {
    total_iters = *pupper - *plower + 1;
  } else if (incr == -1) {
    total_iters = *plower - *pupper + 1;
  } else if (incr > 0) {
    total_iters = (UT)(*pupper - *plower) / incr + 1;
  } else {
    total_iters = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  // Not more iterations than teams: the first total_iters teams get exactly
  // one iteration each, the others get an empty range.
  if (total_iters <= nteams) {
    KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy ||
                     __kmp_static == kmp_sch_static_balanced);
    if (team_id < total_iters) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip range
    }
    if (plastiter != NULL)
      *plastiter = (team_id == total_iters - 1);
    return;
  }

  // Balanced split: every team gets total/nteams iterations and the first
  // total%nteams teams get one more.
  if (__kmp_static == kmp_sch_static_balanced) {
    UT per_team = total_iters / nteams;
    UT leftover = total_iters % nteams;
    *plower +=
        incr * (team_id * per_team + (team_id < leftover ? team_id : leftover));
    *pupper = *plower + per_team * incr - (team_id < leftover ? 0 : incr);
    if (plastiter != NULL)
      *plastiter = (team_id == nteams - 1);
    return;
  }

  // Greedy split: equal spans of ceil(total/nteams) iterations, with the
  // result clamped to the original upper bound.
  T span = (total_iters / nteams + ((total_iters % nteams) ? 1 : 0)) * incr;
  T orig_upper = *pupper;
  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
  *plower += team_id * span;
  *pupper = *plower + span - incr;
  if (incr > 0) {
    // an upper bound below the lower bound is replaced by the type's maximum
    if (*pupper < *plower)
      *pupper = traits_t<T>::max_value;
    if (plastiter != NULL)
      *plastiter = *plower <= orig_upper && *pupper > orig_upper - incr;
    if (*pupper > orig_upper)
      *pupper = orig_upper;
  } else {
    // mirror image for negative strides, using the type's minimum
    if (*pupper > *plower)
      *pupper = traits_t<T>::min_value;
    if (plastiter != NULL)
      *plastiter = *plower >= orig_upper && *pupper < orig_upper - incr;
    if (*pupper < orig_upper)
      *pupper = orig_upper;
  }
}
extern "C" {
/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/
// kmp_int32 variant: forwards the loop description unchanged to
// __kmp_dispatch_init<kmp_int32>, passing true as the final argument.
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
enum sched_type schedule, kmp_int32 lb,
kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
// NOTE(review): presumably records the caller's return address for OMPT
// tools; the macro is defined outside this file section.
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
// kmp_uint32 variant: the bounds are unsigned while the step and chunk stay
// signed; forwards everything to __kmp_dispatch_init<kmp_uint32>.
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
enum sched_type schedule, kmp_uint32 lb,
kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
// NOTE(review): presumably records the caller's return address for OMPT
// tools; the macro is defined outside this file section.
OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
// Variant of __kmpc_dispatch_init_4 whose bounds, step and chunk are all
// kmp_int64. Forwards to __kmp_dispatch_init<kmp_int64>.
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  // Debug builds check that __kmp_init_serial is already set.
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Trailing flag: the slot __kmp_aux_dispatch_init_8 fills with its push_ws
  // argument; this entry point always passes true.
  const int push_ws = true;
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
// Variant of __kmpc_dispatch_init_4 with kmp_uint64 bounds; the step and chunk
// are kmp_int64. Forwards to __kmp_dispatch_init<kmp_uint64>.
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  // Debug builds check that __kmp_init_serial is already set.
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Trailing flag: the slot __kmp_aux_dispatch_init_8u fills with its push_ws
  // argument; this entry point always passes true.
  const int push_ws = true;
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}
/*!
See @ref __kmpc_dispatch_init_4

The difference from the __kmpc_dispatch_init set of functions is that these
functions are called for the composite distribute parallel for construct.
Thus, before regular iteration dispatching, the per-team iteration space must
be calculated.

These functions are all identical apart from the types of the arguments.
*/
// Two-step loop start with kmp_int32 bounds: __kmp_dist_get_bounds first
// rewrites the bounds (and may set *p_last), then __kmp_dispatch_init starts
// dynamic dispatch over the rewritten range.
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Working copies of the bounds; __kmp_dist_get_bounds updates them in place.
  kmp_int32 team_lb = lb;
  kmp_int32 team_ub = ub;
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &team_lb, &team_ub, st);

  const int push_ws = true;
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, team_lb, team_ub, st,
                                 chunk, push_ws);
}
// kmp_uint32-bounds variant of __kmpc_dist_dispatch_init_4 (step and chunk are
// kmp_int32): rewrite the bounds with __kmp_dist_get_bounds, then start
// dynamic dispatch over the rewritten range.
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Working copies of the bounds; __kmp_dist_get_bounds updates them in place.
  kmp_uint32 team_lb = lb;
  kmp_uint32 team_ub = ub;
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &team_lb, &team_ub, st);

  const int push_ws = true;
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, team_lb, team_ub, st,
                                  chunk, push_ws);
}
// kmp_int64 variant of __kmpc_dist_dispatch_init_4: rewrite the bounds with
// __kmp_dist_get_bounds, then start dynamic dispatch over the rewritten range.
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Working copies of the bounds; __kmp_dist_get_bounds updates them in place.
  kmp_int64 team_lb = lb;
  kmp_int64 team_ub = ub;
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &team_lb, &team_ub, st);

  const int push_ws = true;
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, team_lb, team_ub, st,
                                 chunk, push_ws);
}
// kmp_uint64-bounds variant of __kmpc_dist_dispatch_init_4 (step and chunk are
// kmp_int64): rewrite the bounds with __kmp_dist_get_bounds, then start
// dynamic dispatch over the rewritten range.
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  // Working copies of the bounds; __kmp_dist_get_bounds updates them in place.
  kmp_uint64 team_lb = lb;
  kmp_uint64 team_ub = ub;
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &team_lb, &team_ub, st);

  const int push_ws = true;
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, team_lb, team_ub, st,
                                  chunk, push_ws);
}
/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb Pointer to the lower bound for the next chunk of work
@param p_ub Pointer to the upper bound for the next chunk of work
@param p_st Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub and stride need not be modified.
*/
// Requests the next chunk of a dynamically scheduled loop with kmp_int32
// bounds by delegating to __kmp_dispatch_next<kmp_int32>; the callee's result
// is handed back to the caller.
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // With optional OMPT support the callee takes one extra trailing argument.
  OMPT_STORE_RETURN_ADDRESS(gtid);
  const int status = __kmp_dispatch_next<kmp_int32>(
      loc, gtid, p_last, p_lb, p_ub, p_st, OMPT_LOAD_RETURN_ADDRESS(gtid));
#else
  const int status =
      __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
#endif
  return status;
}
/*!
See @ref __kmpc_dispatch_next_4
*/
// kmp_uint32-bounds variant of __kmpc_dispatch_next_4 (the stride stays
// kmp_int32); delegates to __kmp_dispatch_next<kmp_uint32> and hands its
// result back to the caller.
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // With optional OMPT support the callee takes one extra trailing argument.
  OMPT_STORE_RETURN_ADDRESS(gtid);
  const int status = __kmp_dispatch_next<kmp_uint32>(
      loc, gtid, p_last, p_lb, p_ub, p_st, OMPT_LOAD_RETURN_ADDRESS(gtid));
#else
  const int status =
      __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
#endif
  return status;
}
/*!
See @ref __kmpc_dispatch_next_4
*/
// kmp_int64 variant of __kmpc_dispatch_next_4; delegates to
// __kmp_dispatch_next<kmp_int64> and hands its result back to the caller.
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // With optional OMPT support the callee takes one extra trailing argument.
  OMPT_STORE_RETURN_ADDRESS(gtid);
  const int status = __kmp_dispatch_next<kmp_int64>(
      loc, gtid, p_last, p_lb, p_ub, p_st, OMPT_LOAD_RETURN_ADDRESS(gtid));
#else
  const int status =
      __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
#endif
  return status;
}
/*!
See @ref __kmpc_dispatch_next_4
*/
// kmp_uint64-bounds variant of __kmpc_dispatch_next_4 (the stride is
// kmp_int64); delegates to __kmp_dispatch_next<kmp_uint64> and hands its
// result back to the caller.
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // With optional OMPT support the callee takes one extra trailing argument.
  OMPT_STORE_RETURN_ADDRESS(gtid);
  const int status = __kmp_dispatch_next<kmp_uint64>(
      loc, gtid, p_last, p_lb, p_ub, p_st, OMPT_LOAD_RETURN_ADDRESS(gtid));
#else
  const int status =
      __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
#endif
  return status;
}
/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
// End-of-loop entry point for the _4 family. Calls __kmp_dispatch_finish with
// the arguments in (gtid, loc) order, instantiated on kmp_uint32.
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint32 finish_arg_t;
  __kmp_dispatch_finish<finish_arg_t>(gtid, loc);
}
/*!
See @ref __kmpc_dispatch_fini_4
*/
// End-of-loop entry point for the _8 family. Calls __kmp_dispatch_finish with
// the arguments in (gtid, loc) order, instantiated on kmp_uint64.
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint64 finish_arg_t;
  __kmp_dispatch_finish<finish_arg_t>(gtid, loc);
}
/*!
See @ref __kmpc_dispatch_fini_4
*/
// End-of-loop entry point for the _4u family. Calls __kmp_dispatch_finish with
// the arguments in (gtid, loc) order, instantiated on kmp_uint32.
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint32 finish_arg_t;
  __kmp_dispatch_finish<finish_arg_t>(gtid, loc);
}
/*!
See @ref __kmpc_dispatch_fini_4
*/
// End-of-loop entry point for the _8u family. Calls __kmp_dispatch_finish with
// the arguments in (gtid, loc) order, instantiated on kmp_uint64.
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint64 finish_arg_t;
  __kmp_dispatch_finish<finish_arg_t>(gtid, loc);
}
/*!
See @ref __kmpc_dispatch_deinit
*/
// Dispatch de-initialization entry point. It performs no work: neither
// argument is read and nothing is returned.
void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {
  (void)loc;
  (void)gtid;
}
// Equality predicate with the signature __kmp_wait_4 expects for its pred
// argument: returns 1 when value == checker, otherwise 0.
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  if (value == checker)
    return 1;
  return 0;
}
// Inequality predicate with the signature __kmp_wait_4 expects for its pred
// argument: returns 1 when value != checker, otherwise 0.
kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  if (value != checker)
    return 1;
  return 0;
}
// Less-than predicate with the signature __kmp_wait_4 expects for its pred
// argument: returns 1 when value < checker, otherwise 0.
kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  if (value < checker)
    return 1;
  return 0;
}
// Greater-or-equal predicate with the signature __kmp_wait_4 expects for its
// pred argument: returns 1 when value >= checker, otherwise 0.
kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  if (value >= checker)
    return 1;
  return 0;
}
// Less-or-equal predicate with the signature __kmp_wait_4 expects for its pred
// argument: returns 1 when value <= checker, otherwise 0.
kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  if (value <= checker)
    return 1;
  return 0;
}
// Spin until pred(*spinner, checker) returns nonzero, then return the value
// of *spinner that satisfied the predicate.
//
//   spinner  location re-read (through TCR_4) on every iteration
//   checker  value handed to pred as its second argument
//   pred     comparison callback, e.g. __kmp_eq_4 or __kmp_ge_4
//   obj      object handed to the KMP_FSYNC_SPIN_* macros; not otherwise
//            used by this function
kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj
) {
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;
  kmp_uint64 time;
  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // Main wait loop: r always holds the most recently read value of *spin, so
  // the value returned is exactly the one the predicate accepted.
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* NOTE: __kmp_abort_thread() must not be called from this loop. It causes
       problems with infinite recursion because of exit lock. */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
kmp_uint32 (*pred)(void *, kmp_uint32),
void *obj
) {
void *spin = spinner;
kmp_uint32 check = checker;
kmp_uint32 spins;
kmp_uint32 (*f)(void *, kmp_uint32) = pred;
kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(obj, spin);
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
while (!f(spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
}
}
#ifdef KMP_GOMP_COMPAT
// Compiled only under KMP_GOMP_COMPAT. Forwards to
// __kmp_dispatch_init<kmp_int32>, passing the caller's push_ws where
// __kmpc_dispatch_init_4 hard-codes true.
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  typedef kmp_int32 bound_t;
  __kmp_dispatch_init<bound_t>(loc, gtid, schedule, lb, ub, st, chunk,
                               push_ws);
}
// Compiled only under KMP_GOMP_COMPAT. Forwards to
// __kmp_dispatch_init<kmp_uint32>, passing the caller's push_ws where
// __kmpc_dispatch_init_4u hard-codes true.
void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  typedef kmp_uint32 bound_t;
  __kmp_dispatch_init<bound_t>(loc, gtid, schedule, lb, ub, st, chunk,
                               push_ws);
}
// Compiled only under KMP_GOMP_COMPAT. Forwards to
// __kmp_dispatch_init<kmp_int64>, passing the caller's push_ws where
// __kmpc_dispatch_init_8 hard-codes true.
void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  typedef kmp_int64 bound_t;
  __kmp_dispatch_init<bound_t>(loc, gtid, schedule, lb, ub, st, chunk,
                               push_ws);
}
// Compiled only under KMP_GOMP_COMPAT. Forwards to
// __kmp_dispatch_init<kmp_uint64>, passing the caller's push_ws where
// __kmpc_dispatch_init_8u hard-codes true.
void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  typedef kmp_uint64 bound_t;
  __kmp_dispatch_init<bound_t>(loc, gtid, schedule, lb, ub, st, chunk,
                               push_ws);
}
// Compiled only under KMP_GOMP_COMPAT. Calls __kmp_dispatch_finish_chunk with
// the arguments in (gtid, loc) order, instantiated on kmp_uint32.
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint32 finish_arg_t;
  __kmp_dispatch_finish_chunk<finish_arg_t>(gtid, loc);
}
// Compiled only under KMP_GOMP_COMPAT. Calls __kmp_dispatch_finish_chunk with
// the arguments in (gtid, loc) order, instantiated on kmp_uint64.
void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint64 finish_arg_t;
  __kmp_dispatch_finish_chunk<finish_arg_t>(gtid, loc);
}
// Compiled only under KMP_GOMP_COMPAT. Calls __kmp_dispatch_finish_chunk with
// the arguments in (gtid, loc) order, instantiated on kmp_uint32.
void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint32 finish_arg_t;
  __kmp_dispatch_finish_chunk<finish_arg_t>(gtid, loc);
}
// Compiled only under KMP_GOMP_COMPAT. Calls __kmp_dispatch_finish_chunk with
// the arguments in (gtid, loc) order, instantiated on kmp_uint64.
void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  typedef kmp_uint64 finish_arg_t;
  __kmp_dispatch_finish_chunk<finish_arg_t>(gtid, loc);
}
#endif