* kmp_collapse.cpp -- loop collapse feature
*/
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
template <typename T> T __kmp_abs(const T val) {
return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
template <typename T> int __kmp_sign(T val) {
return (T(0) < val) - (val < T(0));
}
template <typename T> class CollapseAllocator {
typedef T *pT;
private:
static const size_t allocaSize = 32;
char stackAlloc[allocaSize];
static constexpr size_t maxElemCount = allocaSize / sizeof(T);
pT pTAlloc;
public:
CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
if (n > maxElemCount) {
pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
}
}
~CollapseAllocator() {
if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
__kmp_free(pTAlloc);
}
}
T &operator[](int index) { return pTAlloc[index]; }
operator const pT() { return pTAlloc; }
};
template <typename T>
void kmp_canonicalize_one_loop_XX(
ident_t *loc,
bounds_infoXX_template<T> *bounds) {
if (__kmp_env_consistency_check) {
if (bounds->step == 0) {
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
loc);
}
}
if (bounds->comparison == comparison_t::comp_not_eq) {
if (bounds->step > 0) {
bounds->comparison = comparison_t::comp_less;
} else {
bounds->comparison = comparison_t::comp_greater;
}
}
if (bounds->comparison == comparison_t::comp_less) {
bounds->ub0 -= 1;
bounds->comparison = comparison_t::comp_less_or_eq;
} else if (bounds->comparison == comparison_t::comp_greater) {
bounds->ub0 += 1;
bounds->comparison = comparison_t::comp_greater_or_eq;
}
}
void kmp_canonicalize_loop_nest(ident_t *loc,
bounds_info_t *original_bounds_nest,
kmp_index_t n) {
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
kmp_canonicalize_one_loop_XX<kmp_int32>(
loc,
(bounds_infoXX_template<kmp_int32> *)(bounds));
break;
case loop_type_t::loop_type_uint32:
kmp_canonicalize_one_loop_XX<kmp_uint32>(
loc,
(bounds_infoXX_template<kmp_uint32> *)(bounds));
break;
case loop_type_t::loop_type_int64:
kmp_canonicalize_one_loop_XX<kmp_int64>(
loc,
(bounds_infoXX_template<kmp_int64> *)(bounds));
break;
case loop_type_t::loop_type_uint64:
kmp_canonicalize_one_loop_XX<kmp_uint64>(
loc,
(bounds_infoXX_template<kmp_uint64> *)(bounds));
break;
default:
KMP_ASSERT(false);
}
}
}
template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
bounds_infoXX_template<T> *bounds) {
if (bounds->comparison == comparison_t::comp_less_or_eq) {
if (bounds->ub0 < bounds->lb0) {
bounds->trip_count = 0;
} else {
bounds->trip_count =
static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
__kmp_abs(bounds->step) +
1;
}
} else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
if (bounds->lb0 < bounds->ub0) {
bounds->trip_count = 0;
} else {
bounds->trip_count =
static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
__kmp_abs(bounds->step) +
1;
}
} else {
KMP_ASSERT(false);
}
return bounds->trip_count;
}
kmp_loop_nest_iv_t kmp_calculate_trip_count( bounds_info_t *bounds) {
kmp_loop_nest_iv_t trip_count = 0;
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds));
break;
case loop_type_t::loop_type_uint32:
trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds));
break;
case loop_type_t::loop_type_int64:
trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds));
break;
case loop_type_t::loop_type_uint64:
trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds));
break;
default:
KMP_ASSERT(false);
}
return trip_count;
}
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
kmp_uint64 res = 0;
switch (loop_iv_type) {
case loop_type_t::loop_type_int8:
res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
break;
case loop_type_t::loop_type_uint8:
res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
break;
case loop_type_t::loop_type_int16:
res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
break;
case loop_type_t::loop_type_uint16:
res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
break;
case loop_type_t::loop_type_int32:
res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
break;
case loop_type_t::loop_type_uint32:
res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
break;
case loop_type_t::loop_type_int64:
res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
break;
case loop_type_t::loop_type_uint64:
res = static_cast<kmp_uint64>(original_iv);
break;
default:
KMP_ASSERT(false);
}
return res;
}
bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
kmp_uint64 original_iv2) {
bool res = false;
switch (loop_iv_type) {
case loop_type_t::loop_type_int8:
res = static_cast<kmp_int8>(original_iv1) ==
static_cast<kmp_int8>(original_iv2);
break;
case loop_type_t::loop_type_uint8:
res = static_cast<kmp_uint8>(original_iv1) ==
static_cast<kmp_uint8>(original_iv2);
break;
case loop_type_t::loop_type_int16:
res = static_cast<kmp_int16>(original_iv1) ==
static_cast<kmp_int16>(original_iv2);
break;
case loop_type_t::loop_type_uint16:
res = static_cast<kmp_uint16>(original_iv1) ==
static_cast<kmp_uint16>(original_iv2);
break;
case loop_type_t::loop_type_int32:
res = static_cast<kmp_int32>(original_iv1) ==
static_cast<kmp_int32>(original_iv2);
break;
case loop_type_t::loop_type_uint32:
res = static_cast<kmp_uint32>(original_iv1) ==
static_cast<kmp_uint32>(original_iv2);
break;
case loop_type_t::loop_type_int64:
res = static_cast<kmp_int64>(original_iv1) ==
static_cast<kmp_int64>(original_iv2);
break;
case loop_type_t::loop_type_uint64:
res = static_cast<kmp_uint64>(original_iv1) ==
static_cast<kmp_uint64>(original_iv2);
break;
default:
KMP_ASSERT(false);
}
return res;
}
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
const kmp_point_t original_ivs,
kmp_index_t ind) {
T iv = static_cast<T>(original_ivs[ind]);
T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
(iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
((bounds->comparison == comparison_t::comp_greater_or_eq) &&
(iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
return false;
}
return true;
}
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
kmp_point_t original_ivs,
const kmp_iterations_t iterations, kmp_index_t ind,
bool start_with_lower_bound, bool checkBounds) {
kmp_uint64 temp = 0;
T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
if (start_with_lower_bound) {
temp = bounds->lb0 + bounds->lb1 * outer_iv;
} else {
auto iteration = iterations[ind];
temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
}
original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
if (checkBounds) {
return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
} else {
return true;
}
}
bool kmp_calc_one_iv(const bounds_info_t *bounds,
kmp_point_t original_ivs,
const kmp_iterations_t iterations, kmp_index_t ind,
bool start_with_lower_bound, bool checkBounds) {
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
return kmp_calc_one_iv_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds),
original_ivs, iterations, ind, start_with_lower_bound,
checkBounds);
break;
case loop_type_t::loop_type_uint32:
return kmp_calc_one_iv_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds),
original_ivs, iterations, ind, start_with_lower_bound,
checkBounds);
break;
case loop_type_t::loop_type_int64:
return kmp_calc_one_iv_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds),
original_ivs, iterations, ind, start_with_lower_bound,
checkBounds);
break;
case loop_type_t::loop_type_uint64:
return kmp_calc_one_iv_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds),
original_ivs, iterations, ind, start_with_lower_bound,
checkBounds);
break;
default:
KMP_ASSERT(false);
return false;
}
}
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
kmp_uint64 *original_ivs,
const kmp_iterations_t iterations,
kmp_index_t ind) {
auto iteration = iterations[ind];
kmp_uint64 temp =
bounds->lb0 +
bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
iteration * bounds->step;
original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}
void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
kmp_uint64 *original_ivs,
const kmp_iterations_t iterations,
kmp_index_t ind) {
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
kmp_calc_one_iv_rectang_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds),
original_ivs, iterations, ind);
break;
case loop_type_t::loop_type_uint32:
kmp_calc_one_iv_rectang_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds),
original_ivs, iterations, ind);
break;
case loop_type_t::loop_type_int64:
kmp_calc_one_iv_rectang_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds),
original_ivs, iterations, ind);
break;
case loop_type_t::loop_type_uint64:
kmp_calc_one_iv_rectang_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds),
original_ivs, iterations, ind);
break;
default:
KMP_ASSERT(false);
}
}
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
bounds_info_t *original_bounds_nest,
kmp_index_t n) {
kmp_canonicalize_loop_nest(loc, original_bounds_nest, n);
kmp_loop_nest_iv_t total = 1;
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count( bounds);
total *= trip_count;
}
return total;
}
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
const bounds_info_t *original_bounds_nest,
kmp_uint64 *original_ivs,
kmp_index_t n) {
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
for (kmp_index_t ind = n; ind > 0;) {
--ind;
auto bounds = &(original_bounds_nest[ind]);
auto temp = new_iv / bounds->trip_count;
auto iteration = new_iv % bounds->trip_count;
new_iv = temp;
iterations[ind] = iteration;
}
KMP_ASSERT(new_iv == 0);
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
kmp_calc_one_iv_rectang(bounds, original_ivs, iterations, ind);
}
}
template <typename T>
void kmp_calc_span_lessoreq_XX(
bounds_info_internalXX_template<T> *bounds,
bounds_info_internal_t *bounds_nest) {
typedef typename traits_t<T>::unsigned_t UT;
typedef T span_t;
auto &bbounds = bounds->b;
if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
bounds_info_internalXX_template<T> *previous =
reinterpret_cast<bounds_info_internalXX_template<T> *>(
&(bounds_nest[bbounds.outer_iv]));
{
span_t bound_candidate1 =
bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
span_t bound_candidate2 =
bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
if (bound_candidate1 < bound_candidate2) {
bounds->span_smallest = bound_candidate1;
} else {
bounds->span_smallest = bound_candidate2;
}
}
{
span_t bound_candidate1 =
bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
span_t bound_candidate2 =
bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
if (bound_candidate1 < bound_candidate2) {
bounds->span_biggest = bound_candidate2;
} else {
bounds->span_biggest = bound_candidate1;
}
}
} else {
bounds->span_smallest = bbounds.lb0;
bounds->span_biggest = bbounds.ub0;
}
if (!bounds->loop_bounds_adjusted) {
bounds->span_biggest -=
(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step;
}
}
template <typename T>
void kmp_calc_span_greateroreq_XX(
bounds_info_internalXX_template<T> *bounds,
bounds_info_internal_t *bounds_nest) {
typedef typename traits_t<T>::unsigned_t UT;
typedef T span_t;
auto &bbounds = bounds->b;
if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
bounds_info_internalXX_template<T> *previous =
reinterpret_cast<bounds_info_internalXX_template<T> *>(
&(bounds_nest[bbounds.outer_iv]));
{
span_t bound_candidate1 =
bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
span_t bound_candidate2 =
bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
if (bound_candidate1 >= bound_candidate2) {
bounds->span_smallest = bound_candidate1;
} else {
bounds->span_smallest = bound_candidate2;
}
}
{
span_t bound_candidate1 =
bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
span_t bound_candidate2 =
bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
if (bound_candidate1 >= bound_candidate2) {
bounds->span_biggest = bound_candidate2;
} else {
bounds->span_biggest = bound_candidate1;
}
}
} else {
bounds->span_biggest = bbounds.lb0;
bounds->span_smallest = bbounds.ub0;
}
if (!bounds->loop_bounds_adjusted) {
bounds->span_biggest -=
(static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step;
}
}
template <typename T>
void kmp_calc_span_XX(
bounds_info_internalXX_template<T> *bounds,
bounds_info_internal_t *bounds_nest) {
if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
kmp_calc_span_lessoreq_XX( bounds, bounds_nest);
} else {
KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
kmp_calc_span_greateroreq_XX( bounds, bounds_nest);
}
}
template <typename T>
void kmp_calc_new_bounds_XX(
bounds_info_internalXX_template<T> *bounds,
bounds_info_internal_t *bounds_nest) {
auto &bbounds = bounds->b;
if (bbounds.lb1 == bbounds.ub1) {
bounds->loop_bounds_adjusted = false;
} else {
bounds->loop_bounds_adjusted = true;
T old_lb1 = bbounds.lb1;
T old_ub1 = bbounds.ub1;
if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
bbounds.lb1 = 0;
bbounds.ub1 = 0;
} else {
if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
((old_lb1 > 0) && (old_lb1 > old_ub1))) {
bbounds.lb1 = old_ub1;
} else {
bbounds.ub1 = old_lb1;
}
}
bounds_info_internalXX_template<T> *previous =
reinterpret_cast<bounds_info_internalXX_template<T> *>(
&bounds_nest[bbounds.outer_iv]);
if (bbounds.comparison == comparison_t::comp_less_or_eq) {
if (old_lb1 < bbounds.lb1) {
KMP_ASSERT(old_lb1 < 0);
T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
bbounds.lb0 -= sub;
} else if (old_lb1 > bbounds.lb1) {
T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
bbounds.lb0 += add;
}
if (old_ub1 > bbounds.ub1) {
KMP_ASSERT(old_ub1 > 0);
T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
bbounds.ub0 += add;
} else if (old_ub1 < bbounds.ub1) {
T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
bbounds.ub0 -= sub;
}
} else {
KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
if (old_lb1 < bbounds.lb1) {
KMP_ASSERT(old_lb1 < 0);
T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
bbounds.lb0 -= sub;
} else if (old_lb1 > bbounds.lb1) {
T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
bbounds.lb0 += add;
}
if (old_ub1 > bbounds.ub1) {
KMP_ASSERT(old_ub1 > 0);
T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
bbounds.ub0 += add;
} else if (old_ub1 < bbounds.ub1) {
T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
bbounds.ub0 -= sub;
}
}
}
}
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
bounds_info_internalXX_template<T> *bounds,
bounds_info_internal_t *bounds_nest) {
kmp_calc_new_bounds_XX( bounds, bounds_nest);
kmp_calc_span_XX( bounds, bounds_nest);
return kmp_calculate_trip_count_XX( &(bounds->b));
}
kmp_loop_nest_iv_t kmp_process_loop_nest(
bounds_info_internal_t *bounds_nest, kmp_index_t n) {
kmp_loop_nest_iv_t total = 1;
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(bounds_nest[ind]);
kmp_loop_nest_iv_t trip_count = 0;
switch (bounds->b.loop_type) {
case loop_type_t::loop_type_int32:
trip_count = kmp_process_one_loop_XX<kmp_int32>(
(bounds_info_internalXX_template<kmp_int32> *)(bounds),
bounds_nest);
break;
case loop_type_t::loop_type_uint32:
trip_count = kmp_process_one_loop_XX<kmp_uint32>(
(bounds_info_internalXX_template<kmp_uint32> *)(bounds),
bounds_nest);
break;
case loop_type_t::loop_type_int64:
trip_count = kmp_process_one_loop_XX<kmp_int64>(
(bounds_info_internalXX_template<kmp_int64> *)(bounds),
bounds_nest);
break;
case loop_type_t::loop_type_uint64:
trip_count = kmp_process_one_loop_XX<kmp_uint64>(
(bounds_info_internalXX_template<kmp_uint64> *)(bounds),
bounds_nest);
break;
default:
KMP_ASSERT(false);
}
total *= trip_count;
}
return total;
}
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
const kmp_point_t original_ivs,
kmp_index_t ind) {
kmp_loop_nest_iv_t iterations = 0;
if (bounds->comparison == comparison_t::comp_less_or_eq) {
iterations =
(static_cast<T>(original_ivs[ind]) - bounds->lb0 -
bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
__kmp_abs(bounds->step);
} else {
KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
iterations = (bounds->lb0 +
bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
static_cast<T>(original_ivs[ind])) /
__kmp_abs(bounds->step);
}
return iterations;
}
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
const kmp_point_t original_ivs,
kmp_index_t ind) {
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
return kmp_calc_number_of_iterations_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
break;
case loop_type_t::loop_type_uint32:
return kmp_calc_number_of_iterations_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
break;
case loop_type_t::loop_type_int64:
return kmp_calc_number_of_iterations_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
break;
case loop_type_t::loop_type_uint64:
return kmp_calc_number_of_iterations_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
break;
default:
KMP_ASSERT(false);
return 0;
}
}
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
const kmp_point_t original_ivs,
kmp_index_t n) {
kmp_loop_nest_iv_t new_iv = 0;
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(bounds_nest[ind].b);
new_iv = new_iv * bounds->trip_count +
kmp_calc_number_of_iterations(bounds, original_ivs, ind);
}
return new_iv;
}
bool kmp_calc_original_ivs_from_iterations(
const bounds_info_t *original_bounds_nest, kmp_index_t n,
kmp_point_t original_ivs,
kmp_iterations_t iterations, kmp_index_t ind) {
kmp_index_t lengthened_ind = n;
for (; ind < n;) {
auto bounds = &(original_bounds_nest[ind]);
bool good = kmp_calc_one_iv(bounds, original_ivs, iterations,
ind, (lengthened_ind < ind), true);
if (!good) {
if (ind == 0) {
return false;
} else {
--ind;
++iterations[ind];
lengthened_ind = ind;
for (kmp_index_t i = ind + 1; i < n; ++i) {
iterations[i] = 0;
}
continue;
}
}
++ind;
}
return true;
}
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
kmp_index_t n,
kmp_point_t original_ivs) {
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
for (kmp_index_t ind = n; ind > 0;) {
--ind;
iterations[ind] = 0;
}
bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
original_ivs,
iterations, 0);
return b;
}
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
kmp_index_t n, const kmp_point_t original_ivs,
kmp_point_t next_original_ivs) {
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
}
for (kmp_index_t ind = 0; ind < n; ++ind) {
next_original_ivs[ind] = original_ivs[ind];
}
kmp_index_t ind = n - 1;
++iterations[ind];
bool b = kmp_calc_original_ivs_from_iterations(
original_bounds_nest, n, next_original_ivs, iterations, ind);
return b;
}
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
const bounds_infoXX_template<T> *bounds,
const bounds_infoXX_template<T> *updated_bounds,
kmp_point_t original_ivs, const kmp_iterations_t iterations,
kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
const kmp_point_t original_ivs_start) {
T temp = 0;
T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
if (start_with_lower_bound) {
temp = bounds->lb0 + bounds->lb1 * outer_iv;
} else {
auto iteration = iterations[ind];
auto step = bounds->step;
auto accountForStep =
((bounds->lb0 + bounds->lb1 * outer_iv) -
(updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
step;
temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
accountForStep + iteration * step;
if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
(temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
((bounds->comparison == comparison_t::comp_greater_or_eq) &&
(temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
}
if (compare_with_start) {
T start = static_cast<T>(original_ivs_start[ind]);
temp = kmp_fix_iv(bounds->loop_iv_type, temp);
if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
(temp < start)) ||
((bounds->comparison == comparison_t::comp_greater_or_eq) &&
(temp > start))) {
temp = start + iteration / 4 * step;
}
}
}
original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
(temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
((bounds->comparison == comparison_t::comp_greater_or_eq) &&
(temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
return false;
}
return true;
}
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
const bounds_info_t *updated_bounds,
kmp_point_t original_ivs,
const kmp_iterations_t iterations,
kmp_index_t ind, bool start_with_lower_bound,
bool compare_with_start,
const kmp_point_t original_ivs_start) {
switch (bounds->loop_type) {
case loop_type_t::loop_type_int32:
return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds),
(bounds_infoXX_template<kmp_int32> *)(updated_bounds),
original_ivs, iterations, ind, start_with_lower_bound,
compare_with_start, original_ivs_start);
break;
case loop_type_t::loop_type_uint32:
return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds),
(bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
original_ivs, iterations, ind, start_with_lower_bound,
compare_with_start, original_ivs_start);
break;
case loop_type_t::loop_type_int64:
return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds),
(bounds_infoXX_template<kmp_int64> *)(updated_bounds),
original_ivs, iterations, ind, start_with_lower_bound,
compare_with_start, original_ivs_start);
break;
case loop_type_t::loop_type_uint64:
return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds),
(bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
original_ivs, iterations, ind, start_with_lower_bound,
compare_with_start, original_ivs_start);
break;
default:
KMP_ASSERT(false);
return false;
}
}
bool kmp_calc_original_ivs_for_chunk_end(
const bounds_info_t *original_bounds_nest, kmp_index_t n,
const bounds_info_internal_t *updated_bounds_nest,
const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
kmp_point_t original_ivs) {
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
for (kmp_index_t ind = n; ind > 0;) {
--ind;
auto &updated_bounds = updated_bounds_nest[ind];
auto new_ind = new_iv / updated_bounds.b.trip_count;
auto iteration = new_iv % updated_bounds.b.trip_count;
new_iv = new_ind;
iterations[ind] = iteration;
}
KMP_DEBUG_ASSERT(new_iv == 0);
kmp_index_t lengthened_ind = n;
kmp_index_t equal_ind = -1;
for (kmp_index_t ind = 0; ind < n;) {
auto bounds = &(original_bounds_nest[ind]);
auto updated_bounds = &(updated_bounds_nest[ind].b);
bool good = kmp_calc_one_iv_for_chunk_end(
bounds, updated_bounds,
original_ivs, iterations, ind, (lengthened_ind < ind),
(equal_ind >= ind - 1), original_ivs_start);
if (!good) {
if (ind == 0) {
return false;
} else {
--ind;
++(iterations[ind]);
lengthened_ind = ind;
if (equal_ind >= lengthened_ind) {
equal_ind = lengthened_ind - 1;
}
for (kmp_index_t i = ind + 1; i < n; ++i) {
iterations[i] = 0;
}
continue;
}
}
if ((equal_ind == ind - 1) &&
(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
original_ivs_start[ind]))) {
equal_ind = ind;
} else if ((equal_ind > ind - 1) &&
!(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
original_ivs_start[ind]))) {
equal_ind = ind - 1;
}
++ind;
}
return true;
}
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
kmp_point_t original_ivs,
kmp_index_t ind) {
T temp = bounds->ub0 +
bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}
void kmp_calc_one_iv_end(const bounds_info_t *bounds,
kmp_point_t original_ivs, kmp_index_t ind) {
switch (bounds->loop_type) {
default:
KMP_ASSERT(false);
break;
case loop_type_t::loop_type_int32:
kmp_calc_one_iv_end_XX<kmp_int32>(
(bounds_infoXX_template<kmp_int32> *)(bounds),
original_ivs, ind);
break;
case loop_type_t::loop_type_uint32:
kmp_calc_one_iv_end_XX<kmp_uint32>(
(bounds_infoXX_template<kmp_uint32> *)(bounds),
original_ivs, ind);
break;
case loop_type_t::loop_type_int64:
kmp_calc_one_iv_end_XX<kmp_int64>(
(bounds_infoXX_template<kmp_int64> *)(bounds),
original_ivs, ind);
break;
case loop_type_t::loop_type_uint64:
kmp_calc_one_iv_end_XX<kmp_uint64>(
(bounds_infoXX_template<kmp_uint64> *)(bounds),
original_ivs, ind);
break;
}
}
void kmp_calc_original_ivs_for_end(
const bounds_info_t *const original_bounds_nest, kmp_index_t n,
kmp_point_t original_ivs) {
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
kmp_calc_one_iv_end(bounds, original_ivs, ind);
}
}
* Identify nested loop structure - loops come in the canonical form
* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
* Upper Triangle matrix
* i = 0; i <= N; i++ {0,0}:{N,0}
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
* ************************************************************************/
nested_loop_type_t
kmp_identify_nested_loop_structure( bounds_info_t *original_bounds_nest,
kmp_index_t n) {
if (n != 2) {
return nested_loop_type_unkown;
}
KMP_ASSERT(
(original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
(original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].lb0_u64);
kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].ub0_u64);
kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].lb1_u64);
kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].ub1_u64);
if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
return nested_loop_type_unkown;
}
kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
original_bounds_nest[1].lb0_u64);
kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
original_bounds_nest[1].ub0_u64);
kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
original_bounds_nest[1].lb1_u64);
kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
original_bounds_nest[1].ub1_u64);
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
(inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
return nested_loop_type_lower_triangular_matrix;
}
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
return nested_loop_type_upper_triangular_matrix;
}
return nested_loop_type_unkown;
}
* SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
* Start point is x so the result is always > sqrt(x)
* The method has uniform convergence, PRECISION is set to 0.1
* ************************************************************************/
#define level_of_precision 0.1
double sqrt_newton_approx( kmp_uint64 x) {
double sqrt_old = 0.;
double sqrt_new = (double)x;
do {
sqrt_old = sqrt_new;
sqrt_new = (sqrt_old + x / sqrt_old) / 2;
} while ((sqrt_old - sqrt_new) > level_of_precision);
return sqrt_new;
}
* Handle lower triangle matrix in the canonical form
* i = 0; i <= N; i++ {0,0}:{N,0}
* j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
* ************************************************************************/
void kmp_handle_lower_triangle_matrix(
kmp_uint32 nth,
kmp_uint32 tid,
kmp_index_t n,
bounds_info_t *original_bounds_nest,
bounds_info_t *chunk_bounds_nest) {
for (kmp_index_t i = 0; i < n; ++i) {
chunk_bounds_nest[i] = original_bounds_nest[i];
}
kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].ub0_u64);
kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].lb0_u64);
kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
original_bounds_nest[1].ub0_u64);
kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
kmp_uint64 iter_current =
iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
kmp_uint64 iter_before_current =
tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
kmp_uint64 iter_with_current = iter_before_current + iter_current;
kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
kmp_uint64 lower_bound_outer =
(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
8 * iter_before_current) +
inner_adjustment) /
2 -
inner_adjustment;
kmp_uint64 lower_bound_inner =
iter_before_current -
((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
kmp_uint64 upper_bound_outer =
(kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
8 * iter_with_current) +
inner_adjustment) /
2 -
inner_adjustment;
kmp_uint64 upper_bound_inner =
iter_with_current -
((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
if (upper_bound_inner == 0) {
upper_bound_outer -= 1;
upper_bound_inner = upper_bound_outer;
} else {
upper_bound_inner -= 1;
}
chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
chunk_bounds_nest[0].lb1_u64 = 0;
chunk_bounds_nest[0].ub1_u64 = 0;
chunk_bounds_nest[1].lb1_u64 = 0;
chunk_bounds_nest[1].ub1_u64 = 0;
#if 0
printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
* Handle upper triangle matrix in the canonical form
* i = 0; i <= N; i++ {0,0}:{N,0}
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
* ************************************************************************/
void kmp_handle_upper_triangle_matrix(
kmp_uint32 nth,
kmp_uint32 tid,
kmp_index_t n,
bounds_info_t *original_bounds_nest,
bounds_info_t *chunk_bounds_nest) {
for (kmp_index_t i = 0; i < n; ++i) {
chunk_bounds_nest[i] = original_bounds_nest[i];
}
kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].ub0_u64);
kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
original_bounds_nest[0].lb0_u64);
[[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
kmp_uint64 iter_current =
iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
kmp_uint64 iter_before_current =
tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
kmp_uint64 iter_with_current = iter_before_current + iter_current;
kmp_uint64 lower_bound_outer =
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
kmp_uint64 lower_bound_inner =
iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
kmp_uint64 upper_bound_outer =
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
kmp_uint64 upper_bound_inner =
iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
if (upper_bound_inner == 0) {
upper_bound_outer -= 1;
upper_bound_inner = upper_bound_outer;
} else {
upper_bound_inner -= 1;
}
chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
chunk_bounds_nest[0].lb1_u64 = 0;
chunk_bounds_nest[0].ub1_u64 = 0;
chunk_bounds_nest[1].lb1_u64 = 0;
chunk_bounds_nest[1].ub1_u64 = 0;
#if 0
printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
bounds_info_t *original_bounds_nest,
bounds_info_t *chunk_bounds_nest,
kmp_index_t n, kmp_int32 *plastiter) {
KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
if (__kmp_env_consistency_check) {
__kmp_push_workshare(gtid, ct_pdo, loc);
}
kmp_canonicalize_loop_nest(loc, original_bounds_nest, n);
CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
for (kmp_index_t i = 0; i < n; ++i) {
updated_bounds_nest[i].b = original_bounds_nest[i];
}
kmp_loop_nest_iv_t total =
kmp_process_loop_nest( updated_bounds_nest, n);
if (plastiter != NULL) {
*plastiter = FALSE;
}
if (total == 0) {
return FALSE;
}
__kmp_assert_valid_gtid(gtid);
kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
kmp_info_t *th = __kmp_threads[gtid];
kmp_team_t *team = th->th.th_team;
kmp_uint32 nth = team->t.t_nproc;
KMP_DEBUG_ASSERT(tid < nth);
nested_loop_type_t loop_type =
kmp_identify_nested_loop_structure(original_bounds_nest, n);
if (loop_type == nested_loop_type_lower_triangular_matrix) {
kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
chunk_bounds_nest);
return TRUE;
} else if (loop_type == nested_loop_type_upper_triangular_matrix) {
kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
chunk_bounds_nest);
return TRUE;
}
CollapseAllocator<kmp_uint64> original_ivs_start(n);
if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
original_ivs_start)) {
return FALSE;
}
kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
updated_bounds_nest, original_ivs_start, n);
bool last_iter = false;
for (; nth > 0;) {
KMP_DEBUG_ASSERT(total >= new_iv);
kmp_loop_nest_iv_t total_left = total - new_iv;
kmp_loop_nest_iv_t chunk_size = total_left / nth;
kmp_loop_nest_iv_t remainder = total_left % nth;
kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
if (remainder > 0) {
++curr_chunk_size;
--remainder;
}
#if defined(KMP_DEBUG)
kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif
if (curr_chunk_size > 1) {
new_iv += curr_chunk_size - 1;
}
CollapseAllocator<kmp_uint64> original_ivs_end(n);
if ((nth == 1) || (new_iv >= total - 1)) {
kmp_calc_original_ivs_for_end(original_bounds_nest, n,
original_ivs_end);
last_iter = true;
} else {
if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
updated_bounds_nest,
original_ivs_start, new_iv,
original_ivs_end)) {
kmp_calc_original_ivs_for_end(original_bounds_nest, n,
original_ivs_end);
last_iter = true;
}
}
#if defined(KMP_DEBUG)
auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
updated_bounds_nest, original_ivs_end, n);
KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif
if (last_iter && (tid != 0)) {
return FALSE;
}
if (tid == 0) {
CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
if (last_iter ||
!kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
original_ivs_next_start)) {
if (plastiter != NULL) {
*plastiter = TRUE;
}
}
for (kmp_index_t i = 0; i < n; ++i) {
chunk_bounds_nest[i] =
original_bounds_nest[i];
chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
chunk_bounds_nest[i].lb1_u64 = 0;
chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
chunk_bounds_nest[i].ub1_u64 = 0;
}
return TRUE;
}
--tid;
--nth;
bool next_chunk = kmp_calc_next_original_ivs(
original_bounds_nest, n, original_ivs_end, original_ivs_start);
if (!next_chunk) {
break;
}
new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
original_ivs_start, n);
}
return FALSE;
}