#ifndef OMNI_SHARED_INL_H
#define OMNI_SHARED_INL_H
#include <stddef.h>
#include <stdint.h>
#include "simd/base.h"
#include "simd/targets.h"
#include "simd/instruction/set_macros-inl.h"
namespace simd {
// VecArg<V> is the parameter type used to pass a vector of type V to a function.
// When OMNI_COMPILER_GCC_ACTUAL is set and the target is Windows or Arm A64
// (OMNI_OS_WIN / OMNI_ARCH_ARM_A64), vectors are passed by const reference;
// in every other configuration they are passed by value.
// NOTE(review): presumably a workaround for a GCC issue with by-value vector
// arguments on those platforms - confirm before changing the condition.
#if OMNI_COMPILER_GCC_ACTUAL && (OMNI_OS_WIN || OMNI_ARCH_ARM_A64)
template <class V> using VecArg = const V &;
#else
template <class V> using VecArg = V;
#endif
namespace detail {
// Maps a lane type T to its "native" lane type. By default this is T itself.
template <typename T> struct NativeLaneTypeT {
using type = T;
};
// simd::bfloat16_t lanes are represented natively as uint16_t.
template <> struct NativeLaneTypeT<simd::bfloat16_t> {
using type = uint16_t;
};
// Shorthand for NativeLaneTypeT<T>::type.
template <typename T> using NativeLaneType = typename NativeLaneTypeT<T>::type;
// NativeLanePointer converts a pointer to a lane into a pointer to that
// lane's native representation. One of three overloads is selected by the
// OMNI_IF_* template-parameter constraint on T.
//
// Overload for lane types that are not special floats: returns p unchanged.
template <typename T, OMNI_IF_NOT_SPECIAL_FLOAT(T)> OMNI_INLINE T *NativeLanePointer(T *p)
{
return p;
}
// Overload for OMNI_IF_F16 lane types: returns the address of the `native`
// member of *p. NT defaults to NativeLaneType of the non-const T; the result
// is `const NT *` when T is const and `NT *` otherwise.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>, OMNI_IF_F16(T)>
OMNI_INLINE constexpr If<IsConst<T>(), const NT *, NT *> NativeLanePointer(T *p)
{
return &p->native;
}
// Overload for OMNI_IF_BF16 lane types: returns the address of the `bits`
// member of *p, with the same const propagation as the F16 overload.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>, OMNI_IF_BF16(T)>
OMNI_INLINE constexpr If<IsConst<T>(), const NT *, NT *> NativeLanePointer(T *p)
{
return &p->bits;
}
// For special-float lane types, returns a pointer to the `bits` member of *p
// typed as `const uint16_t *` when T is const and `uint16_t *` otherwise.
// NOTE(review): this requires every type accepted by OMNI_IF_SPECIAL_FLOAT to
// expose a uint16_t `bits` member; the F16 overload of NativeLanePointer uses
// `native` instead - confirm the F16 type provides both members.
template <typename T, OMNI_IF_SPECIAL_FLOAT(T)>
OMNI_INLINE If<IsConst<T>(), const uint16_t *, uint16_t *> U16LanePointer(T *p)
{
return &p->bits;
}
// Scales N by 2^pow2 using shifts. A negative pow2 shifts N right by its
// magnitude (bits shifted out are discarded, so the result can become 0);
// a non-negative pow2 shifts N left.
// The magnitude of pow2 must be smaller than the bit width of size_t.
// Kept as a single return statement so it remains usable as a C++11 constexpr.
constexpr size_t ScaleByPower(size_t N, int pow2)
{
    return (pow2 < 0) ? (N >> (-pow2)) : (N << pow2);
}
// Hook that receives a pointer and an element count. This implementation
// performs no work; the arguments are only referenced so that they do not
// trigger unused-parameter warnings.
// NOTE(review): the name suggests a sanitizer "poison" hook - confirm intent.
template <typename T> OMNI_INLINE void MaybePoison(T *OMNI_RESTRICT unaligned, size_t count)
{
    static_cast<void>(unaligned);
    static_cast<void>(count);
}
// Hook that receives a pointer and an element count. This implementation
// performs no work; the arguments are only referenced so that they do not
// trigger unused-parameter warnings.
// NOTE(review): the name suggests a sanitizer "unpoison" hook - confirm intent.
template <typename T> OMNI_INLINE void MaybeUnpoison(T *OMNI_RESTRICT unaligned, size_t count)
{
    static_cast<void>(unaligned);
    static_cast<void>(count);
}
}
// Compile-time descriptor ("tag") of a vector whose lanes have type Lane.
// The struct has no non-static data members; everything is encoded in the
// template arguments:
//   - N packs two fields. Its low 20 bits (kWhole) hold a power-of-two lane
//     count; the bits above (kFrac) hold an additional right-shift amount,
//     which is only permitted when kWhole == 1.
//   - kPow2 is a further power-of-two scale applied to kWhole.
// The resulting upper bound on lanes is
//   kPrivateLanes = max(1, kWhole scaled by 2^(kPow2 - kFrac)).
template <typename Lane, size_t N, int kPow2> struct Simd {
constexpr Simd() = default;
// Lane type, exposed for TFromD and the OMNI_IF_*_D macros.
using T = Lane;
private:
// Compile-time validation of the template arguments.
static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
static_assert(IsSame<Lane, RemoveCvRef<Lane>>(), "Lane must not be a reference type, const-qualified type, or "
"volatile-qualified type");
static_assert(IsIntegerLaneType<Lane>() || IsFloat<Lane>() || IsSpecialFloat<Lane>(),
"IsIntegerLaneType<T>(), IsFloat<T>(), or IsSpecialFloat<T>() "
"must be true");
// Low 20 bits of N: the whole lane count.
static constexpr size_t kWhole = N & 0xFFFFF;
// Bits 20 and above of N: extra right-shift applied on top of kPow2.
static constexpr int kFrac = static_cast<int>(N >> 20);
static_assert(kWhole <= 8 * OMNI_MAX_N && kFrac <= 3, "Out of range");
static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
// kWhole must be a non-zero power of two.
static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
// Guards against unbounded recursion through Half (which decrements kPow2).
static_assert(kPow2 >= OMNI_MIN_POW2, "Forgot kPow2 recursion terminator?");
public:
// Upper bound on the number of lanes; never less than 1. Read via
// OMNI_MAX_LANES_D rather than directly.
static constexpr size_t kPrivateLanes = OMNI_MAX(size_t{ 1 }, detail::ScaleByPower(kWhole, kPow2 - kFrac));
// Copy of kPow2, read via OMNI_POW2_D.
static constexpr int kPrivatePow2 = kPow2;
// Returns the upper bound on the number of lanes (kPrivateLanes).
constexpr size_t MaxLanes() const
{
return kPrivateLanes;
}
// Returns the upper bound on the vector size in bytes.
constexpr size_t MaxBytes() const
{
return kPrivateLanes * sizeof(Lane);
}
// Returns the number of 16-byte blocks needed to hold MaxBytes(), rounded up.
constexpr size_t MaxBlocks() const
{
return (MaxBytes() + 15) / 16;
}
// Returns the kPow2 template argument.
constexpr int Pow2() const
{
return kPow2;
}
// Number of NewT lanes needed to cover kPrivateLanes lanes of T, rounded up.
template <typename NewT> static constexpr size_t RepartitionLanes()
{
return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
}
// kPow2 adjusted by CeilLog2 of the size ratio between NewT and T: the
// adjustment is added when NewT is at least as large as T and subtracted
// when NewT is smaller.
template <typename NewT> static constexpr int RebindPow2()
{
return kPow2 + ((sizeof(NewT) >= sizeof(T)) ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T))) :
-static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
}
private:
// kNewMaxLanes scaled by 2^(-kNewPow2). Evaluates to 0 when the right shift
// discards every set bit, i.e. when the lane count is not representable as a
// whole N for the given kNewPow2.
template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t WholeN()
{
return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
}
// Encoded N for the fractional case: whole part 1 in the low bits plus a
// shift of max(0, kNewPow2 - CeilLog2(kNewMaxLanes)) stored from bit 20 up.
template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t FracN()
{
// The whole part must fit below bit 20 for this encoding to be unambiguous.
static_assert(OMNI_MAX_N <= (size_t{ 1 } << 20), "Change bit shift");
return static_cast<size_t>(1 + (OMNI_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes))) << 20));
}
public:
// Encoded N for a tag with kNewPow2 and at most kNewMaxLanes lanes: WholeN
// when it is non-zero, otherwise the fractional encoding from FracN.
template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t NewN()
{
return WholeN<kNewPow2, kNewMaxLanes>() == 0 ? FracN<kNewPow2, kNewMaxLanes>() :
WholeN<kNewPow2, kNewMaxLanes>();
}
// Tag with lane type NewT, kPow2 replaced by RebindPow2<NewT>() and N derived
// from the current lane bound kPrivateLanes.
template <typename NewT> using Rebind = Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;
// Tag with lane type NewT, the same kPow2 and N derived from
// RepartitionLanes<NewT>() (same total bytes, rounded up).
template <typename NewT> using Repartition = Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;
// Same lane type and N with kPow2 decremented / incremented by one.
using Half = Simd<T, N, kPow2 - 1>;
using Twice = Simd<T, N, kPow2 + 1>;
};
namespace detail {
// Returns true when the tag has kPow2 == 0 and its N equals OMNI_LANES(T).
// The argument is used only for template argument deduction.
template <typename T, size_t N, int kPow2> constexpr bool IsFull(Simd<T, N, kPow2> /* d */)
{
    return (kPow2 == 0) && (N == OMNI_LANES(T));
}
// Produces a Simd tag whose N is limited to OMNI_MAX_N and whose kPow2 is
// limited to OMNI_MAX_POW2 (both via OMNI_MIN).
template <typename T, size_t N, int kPow2> struct ClampNAndPow2 {
using type = Simd<T, OMNI_MIN(N, OMNI_MAX_N), OMNI_MIN(kPow2, OMNI_MAX_POW2)>;
};
// Implementation of ScalableTag: starts from N = OMNI_LANES(T) with the
// requested kPow2 and passes both through ClampNAndPow2.
template <typename T, int kPow2> struct ScalableTagChecker {
using type = typename ClampNAndPow2<T, OMNI_LANES(T), kPow2>::type;
};
// Implementation of CappedTag: derives a lane count from the caller's limit
// kLimit (which must be non-zero) and passes it through ClampNAndPow2.
template <typename T, size_t kLimit, int kPow2> struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
// 2^FloorLog2(kLimit); presumably kLimit rounded down to a power of two
// (assumes simd::FloorLog2 returns floor(log2(x)) - confirm).
static constexpr size_t kLimitPow2 = size_t{ 1 } << simd::FloorLog2(kLimit);
// The lane count is additionally bounded by OMNI_LANES(T).
static constexpr size_t N = OMNI_MIN(kLimitPow2, OMNI_LANES(T));
using type = typename ClampNAndPow2<T, N, kPow2>::type;
};
// Implementation of FixedTag: yields Simd<T, kNumLanes, 0> after checking at
// compile time that kNumLanes is non-zero and does not exceed OMNI_LANES(T).
template <typename T, size_t kNumLanes> struct FixedTagChecker {
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
static_assert(kNumLanes <= OMNI_LANES(T), "Too many lanes");
using type = Simd<T, kNumLanes, 0>;
};
}
// Tag with OMNI_LANES(T) lanes scaled by 2^kPow2, clamped by
// detail::ClampNAndPow2 (see detail::ScalableTagChecker).
template <typename T, int kPow2 = 0> using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// Tag whose lane count is derived from the upper limit kLimit
// (see detail::CappedTagChecker); kLimit must be non-zero.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;
// CappedTag when OMNI_HAVE_SCALABLE is 0; otherwise kLimit is ignored and the
// result is ScalableTag<T, kPow2>.
#if !OMNI_HAVE_SCALABLE
template <typename T, size_t kLimit, int kPow2 = 0> using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
#else
template <typename T, size_t kLimit, int kPow2 = 0> using CappedTagIfFixed = ScalableTag<T, kPow2>;
#endif
// Tag with exactly kNumLanes as N and kPow2 == 0; kNumLanes is validated by
// detail::FixedTagChecker.
template <typename T, size_t kNumLanes> using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
// Tags for vectors of a fixed total size: N is the number of T lanes that fit
// in 2, 4, 8 or 16 bytes respectively, with kPow2 == 0.
// Note: when sizeof(T) exceeds the byte count, N evaluates to 0, which Simd
// rejects via its "Not 2^x" static_assert once the tag is instantiated.
template <typename T> using Full16 = Simd<T, 2 / sizeof(T), 0>;
template <typename T> using Full32 = Simd<T, 4 / sizeof(T), 0>;
template <typename T> using Full64 = Simd<T, 8 / sizeof(T), 0>;
template <typename T> using Full128 = Simd<T, 16 / sizeof(T), 0>;
// Lane type of tag D (its nested type T).
template <class D> using TFromD = typename D::T;
// Compile-time upper bound on the number of lanes of tag type D.
#define OMNI_MAX_LANES_D(D) D::kPrivateLanes
// The kPow2 template argument of tag type D.
#define OMNI_POW2_D(D) D::kPrivatePow2
// Returns the compile-time upper bound on the number of lanes of tag D
// (D::kPrivateLanes). The argument is used only to deduce D.
template <class D> OMNI_INLINE OMNI_MAYBE_UNUSED constexpr size_t MaxLanes(D /* d */)
{
    return D::kPrivateLanes;
}
#if !OMNI_HAVE_SCALABLE
// Returns the number of lanes of tag D. This overload is only defined when
// OMNI_HAVE_SCALABLE is 0, where the count equals the compile-time bound
// D::kPrivateLanes. The argument is used only to deduce D.
template <class D> OMNI_INLINE OMNI_MAYBE_UNUSED constexpr size_t Lanes(D /* d */)
{
    return D::kPrivateLanes;
}
#endif
// Free-standing aliases that forward to the nested aliases of tag D.
//
// Rebind<T, D>: D's Rebind alias with lane type T.
template <class T, class D> using Rebind = typename D::template Rebind<T>;
// Rebind to the signed / unsigned / floating-point type produced by
// MakeSigned / MakeUnsigned / MakeFloat for D's lane type.
template <class D> using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D> using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D> using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
// Repartition<T, D>: D's Repartition alias with lane type T.
template <class T, class D> using Repartition = typename D::template Repartition<T>;
// Repartition to the type produced by MakeWide / MakeNarrow for D's lane type.
template <class D> using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D> using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
// RepartitionToWide applied two and three times in succession.
template <class D> using RepartitionToWideX2 = RepartitionToWide<RepartitionToWide<D>>;
template <class D> using RepartitionToWideX3 = RepartitionToWide<RepartitionToWideX2<D>>;
// D with kPow2 decremented / incremented by one (see Simd::Half, Simd::Twice).
template <class D> using Half = typename D::Half;
template <class D> using Twice = typename D::Twice;
// BlockDFromD<D>: tag with the same lane type as D, limited to the number of
// lanes that fit in 16 bytes (and never more than D's own lane bound).
#if OMNI_HAVE_SCALABLE
namespace detail {
// Primary template is empty, so only Simd tags provide a nested `type`.
template <class D> class BlockDFromD_t {};
template <typename T, size_t N, int kPow2> class BlockDFromD_t<Simd<T, N, kPow2>> {
using D = Simd<T, N, kPow2>;
// Keeps a negative kPow2 but replaces a positive one by 0.
static constexpr int kNewPow2 = OMNI_MIN(kPow2, 0);
// Lanes per 16-byte block, bounded by D's lane bound.
static constexpr size_t kMaxLpb = OMNI_MIN(16 / sizeof(T), OMNI_MAX_LANES_D(D));
// Encoded N for that lane count under kNewPow2.
static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();
public:
using type = Simd<T, kNewN, kNewPow2>;
};
}
// Constness of D is removed before selecting the specialization.
template <class D> using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
#else
// Without scalable vectors the result always has kPow2 == 0.
template <class D> using BlockDFromD = Simd<TFromD<D>, OMNI_MIN(16 / sizeof(TFromD<D>), OMNI_MAX_LANES_D(D)), 0>;
#endif
// Returns true when the address held by ptr is a multiple of the vector size
// in bytes, computed as Lanes(d) * sizeof(T). Note that T is the pointee type
// of ptr, which is deduced independently of the lane type of d.
template <class D, typename T> OMNI_API bool IsAligned(D d, T *ptr)
{
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    const size_t lane_count = Lanes(d);
    const size_t vector_bytes = lane_count * sizeof(T);
    return (address % vector_bytes) == 0;
}
// Tag-based variants of the OMNI_IF_* constraint macros (which are defined
// outside this file). Each one forwards to the corresponding macro using the
// lane type of tag D, simd::TFromD<D>.
#define OMNI_IF_UNSIGNED_D(D) OMNI_IF_UNSIGNED(simd::TFromD<D>)
#define OMNI_IF_NOT_UNSIGNED_D(D) OMNI_IF_NOT_UNSIGNED(simd::TFromD<D>)
#define OMNI_IF_SIGNED_D(D) OMNI_IF_SIGNED(simd::TFromD<D>)
#define OMNI_IF_FLOAT_D(D) OMNI_IF_FLOAT(simd::TFromD<D>)
#define OMNI_IF_NOT_FLOAT_D(D) OMNI_IF_NOT_FLOAT(simd::TFromD<D>)
#define OMNI_IF_FLOAT3264_D(D) OMNI_IF_FLOAT3264(simd::TFromD<D>)
#define OMNI_IF_NOT_FLOAT3264_D(D) OMNI_IF_NOT_FLOAT3264(simd::TFromD<D>)
#define OMNI_IF_SPECIAL_FLOAT_D(D) OMNI_IF_SPECIAL_FLOAT(simd::TFromD<D>)
#define OMNI_IF_NOT_SPECIAL_FLOAT_D(D) OMNI_IF_NOT_SPECIAL_FLOAT(simd::TFromD<D>)
#define OMNI_IF_FLOAT_OR_SPECIAL_D(D) OMNI_IF_FLOAT_OR_SPECIAL(simd::TFromD<D>)
#define OMNI_IF_NOT_FLOAT_NOR_SPECIAL_D(D) OMNI_IF_NOT_FLOAT_NOR_SPECIAL(simd::TFromD<D>)
// Constraints on the size in bytes of the lane type of D.
#define OMNI_IF_T_SIZE_D(D, bytes) OMNI_IF_T_SIZE(simd::TFromD<D>, bytes)
#define OMNI_IF_NOT_T_SIZE_D(D, bytes) OMNI_IF_NOT_T_SIZE(simd::TFromD<D>, bytes)
#define OMNI_IF_T_SIZE_ONE_OF_D(D, bit_array) OMNI_IF_T_SIZE_ONE_OF(simd::TFromD<D>, bit_array)
#define OMNI_IF_T_SIZE_LE_D(D, bytes) OMNI_IF_T_SIZE_LE(simd::TFromD<D>, bytes)
#define OMNI_IF_T_SIZE_GT_D(D, bytes) OMNI_IF_T_SIZE_GT(simd::TFromD<D>, bytes)
// Constraints on the lane bound of D, OMNI_MAX_LANES_D(D).
#define OMNI_IF_LANES_D(D, lanes) OMNI_IF_LANES(OMNI_MAX_LANES_D(D), lanes)
#define OMNI_IF_LANES_LE_D(D, lanes) OMNI_IF_LANES_LE(OMNI_MAX_LANES_D(D), lanes)
#define OMNI_IF_LANES_GT_D(D, lanes) OMNI_IF_LANES_GT(OMNI_MAX_LANES_D(D), lanes)
#define OMNI_IF_LANES_PER_BLOCK_D(D, lanes) OMNI_IF_LANES_PER_BLOCK(simd::TFromD<D>, OMNI_MAX_LANES_D(D), lanes)
// Template-parameter constraints on the kPow2 of tag D, written as an
// EnableIf pointer parameter defaulted to nullptr. On MSVC the value is read
// from the static member via OMNI_POW2_D(D); other compilers obtain it from
// a default-constructed tag via D().Pow2(). The `>` comparisons are
// parenthesized so they are not parsed as the end of the template argument
// list.
#if OMNI_COMPILER_MSVC
#define OMNI_IF_POW2_LE_D(D, pow2) simd::EnableIf<OMNI_POW2_D(D) <= pow2> * = nullptr
#define OMNI_IF_POW2_GT_D(D, pow2) simd::EnableIf<(OMNI_POW2_D(D) > pow2)> * = nullptr
#else
#define OMNI_IF_POW2_LE_D(D, pow2) simd::EnableIf<D().Pow2() <= pow2> * = nullptr
#define OMNI_IF_POW2_GT_D(D, pow2) simd::EnableIf<(D().Pow2() > pow2)> * = nullptr
#endif
// Tag-based variants of the per-type constraint macros (defined outside this
// file): each forwards to the corresponding macro with the lane type of D.
#define OMNI_IF_U8_D(D) OMNI_IF_U8(simd::TFromD<D>)
#define OMNI_IF_U16_D(D) OMNI_IF_U16(simd::TFromD<D>)
#define OMNI_IF_U32_D(D) OMNI_IF_U32(simd::TFromD<D>)
#define OMNI_IF_U64_D(D) OMNI_IF_U64(simd::TFromD<D>)
#define OMNI_IF_I8_D(D) OMNI_IF_I8(simd::TFromD<D>)
#define OMNI_IF_I16_D(D) OMNI_IF_I16(simd::TFromD<D>)
#define OMNI_IF_I32_D(D) OMNI_IF_I32(simd::TFromD<D>)
#define OMNI_IF_I64_D(D) OMNI_IF_I64(simd::TFromD<D>)
#define OMNI_IF_UI8_D(D) OMNI_IF_UI8(simd::TFromD<D>)
#define OMNI_IF_UI16_D(D) OMNI_IF_UI16(simd::TFromD<D>)
#define OMNI_IF_UI32_D(D) OMNI_IF_UI32(simd::TFromD<D>)
#define OMNI_IF_UI64_D(D) OMNI_IF_UI64(simd::TFromD<D>)
#define OMNI_IF_BF16_D(D) OMNI_IF_BF16(simd::TFromD<D>)
#define OMNI_IF_NOT_BF16_D(D) OMNI_IF_NOT_BF16(simd::TFromD<D>)
#define OMNI_IF_F16_D(D) OMNI_IF_F16(simd::TFromD<D>)
#define OMNI_IF_NOT_F16_D(D) OMNI_IF_NOT_F16(simd::TFromD<D>)
#define OMNI_IF_F32_D(D) OMNI_IF_F32(simd::TFromD<D>)
#define OMNI_IF_F64_D(D) OMNI_IF_F64(simd::TFromD<D>)
// Upper bound on the vector size of D in bytes: lane bound times lane size.
#define OMNI_V_SIZE_D(D) (OMNI_MAX_LANES_D(D) * sizeof(simd::TFromD<D>))
// Constraints on the vector size of D, passing lane type and lane bound to
// the underlying OMNI_IF_V_SIZE* macros.
#define OMNI_IF_V_SIZE_D(D, bytes) OMNI_IF_V_SIZE(simd::TFromD<D>, OMNI_MAX_LANES_D(D), bytes)
#define OMNI_IF_V_SIZE_LE_D(D, bytes) OMNI_IF_V_SIZE_LE(simd::TFromD<D>, OMNI_MAX_LANES_D(D), bytes)
#define OMNI_IF_V_SIZE_GT_D(D, bytes) OMNI_IF_V_SIZE_GT(simd::TFromD<D>, OMNI_MAX_LANES_D(D), bytes)
// Vector-based variants of the constraint macros: each forwards to the
// corresponding macro using simd::TFromV<V> (lane type of vector type V) or
// simd::DFromV<V> (tag of V). Neither alias is defined in this file; they
// must be provided before these macros are expanded.
#define OMNI_IF_UNSIGNED_V(V) OMNI_IF_UNSIGNED(simd::TFromV<V>)
#define OMNI_IF_NOT_UNSIGNED_V(V) OMNI_IF_NOT_UNSIGNED(simd::TFromV<V>)
#define OMNI_IF_SIGNED_V(V) OMNI_IF_SIGNED(simd::TFromV<V>)
#define OMNI_IF_FLOAT_V(V) OMNI_IF_FLOAT(simd::TFromV<V>)
#define OMNI_IF_NOT_FLOAT_V(V) OMNI_IF_NOT_FLOAT(simd::TFromV<V>)
#define OMNI_IF_SPECIAL_FLOAT_V(V) OMNI_IF_SPECIAL_FLOAT(simd::TFromV<V>)
#define OMNI_IF_NOT_FLOAT_NOR_SPECIAL_V(V) OMNI_IF_NOT_FLOAT_NOR_SPECIAL(simd::TFromV<V>)
#define OMNI_IF_T_SIZE_V(V, bytes) OMNI_IF_T_SIZE(simd::TFromV<V>, bytes)
#define OMNI_IF_NOT_T_SIZE_V(V, bytes) OMNI_IF_NOT_T_SIZE(simd::TFromV<V>, bytes)
#define OMNI_IF_T_SIZE_ONE_OF_V(V, bit_array) OMNI_IF_T_SIZE_ONE_OF(simd::TFromV<V>, bit_array)
// Lane bound of the tag belonging to vector type V.
#define OMNI_MAX_LANES_V(V) OMNI_MAX_LANES_D(simd::DFromV<V>)
// Constraints on the size in bytes of vector type V.
#define OMNI_IF_V_SIZE_V(V, bytes) OMNI_IF_V_SIZE(simd::TFromV<V>, OMNI_MAX_LANES_V(V), bytes)
#define OMNI_IF_V_SIZE_LE_V(V, bytes) OMNI_IF_V_SIZE_LE(simd::TFromV<V>, OMNI_MAX_LANES_V(V), bytes)
#define OMNI_IF_V_SIZE_GT_V(V, bytes) OMNI_IF_V_SIZE_GT(simd::TFromV<V>, OMNI_MAX_LANES_V(V), bytes)
// Operation-specific constraints. Each macro is #undef'd first so that any
// definition made earlier is replaced by the one given here.
//
// Enabled unless the lane bound of D is 1, or D has 4 lanes of 1-byte type.
#undef OMNI_IF_REDUCE_D
#define OMNI_IF_REDUCE_D(D) \
simd::EnableIf<OMNI_MAX_LANES_D(D) != 1 && (OMNI_MAX_LANES_D(D) != 4 || sizeof(simd::TFromD<D>) != 1)> * = nullptr
// The following four are enabled when the tag has more than one lane.
#undef OMNI_IF_SUM_OF_LANES_D
#define OMNI_IF_SUM_OF_LANES_D(D) OMNI_IF_LANES_GT_D(D, 1)
#undef OMNI_IF_MINMAX_OF_LANES_D
#define OMNI_IF_MINMAX_OF_LANES_D(D) OMNI_IF_LANES_GT_D(D, 1)
#undef OMNI_IF_ADDSUB_V
#define OMNI_IF_ADDSUB_V(V) OMNI_IF_LANES_GT_D(simd::DFromV<V>, 1)
#undef OMNI_IF_MULADDSUB_V
#define OMNI_IF_MULADDSUB_V(V) OMNI_IF_LANES_GT_D(simd::DFromV<V>, 1)
// Places no restriction on V: expands to an unconstrained defaulted pointer
// template parameter.
#undef OMNI_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define OMNI_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void * = nullptr
// Alternate spellings of OMNI_IF_T_SIZE_D / OMNI_IF_NOT_T_SIZE_D.
#define OMNI_IF_LANE_SIZE_D(D, bytes) OMNI_IF_T_SIZE_D(D, bytes)
#define OMNI_IF_NOT_LANE_SIZE_D(D, bytes) OMNI_IF_NOT_T_SIZE_D(D, bytes)
}
#endif