asc-devkit/impl/basic_api/utils/kernel_utils_ceil_oom_que.h · CANN/asc-devkit - AtomGit

/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

/*!
 * \file kernel_utils_ceil_oom_que.h
 * \brief
 */
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message("impl/basic_api/utils/kernel_utils_ceil_oom_que.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"basic_api/kernel_tpipe.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_UTILS_CEIL_OOM_QUE_H__
#endif
#ifndef ASCENDC_MODULE_UTILS_CEIL_OOM_QUE_H
#define ASCENDC_MODULE_UTILS_CEIL_OOM_QUE_H
#include "utils/kernel_utils_macros.h"
namespace AscendC {
#ifdef ASCENDC_CPU_DEBUG
#define PRELOAD(len) \
    {}

#else
#define PRELOAD(len)                                  \
    do {                                              \
        uint64_t pc;                                  \
        asm volatile("mov %0, pc \n" : "=l"(pc) : :); \
        preload((void*)pc, len);                      \
    } while (0)

#endif

__aicore__ constexpr inline uint32_t DivCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

__aicore__ constexpr inline uint32_t AlignUp(uint32_t a, uint32_t b)
{
    return DivCeil(a, b) * b;
}

__aicore__ constexpr inline uint32_t ConstCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

__aicore__ inline uint32_t Ceil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || \
    (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)) || defined(__ASC_NPU_HOST__)
__aicore__ constexpr inline int32_t CeilDivision(int32_t num1, int32_t num2)
{
    if (num2 == 0) {
        return 0;
    }
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 || !defined(SPLIT_CORE_VEC)
    return (num1 + num2 - 1) / num2;
#else
    return get_repeat_ceiling(num1, num2);
#endif
}
#else
__aicore__ inline int32_t CeilDivision(int32_t num1, int32_t num2)
{
    if (num2 == 0) {
        return 0;
    }
    return (num1 + num2 - 1) / num2;
}
#endif

// only for ascend910, ascend310p
__aicore__ inline void WriteBackOverflow(GM_ADDR overflowStatus)
{
    (void)overflowStatus;
#if (__NPU_ARCH__ == 5102)
#elif defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 1001) || (__NPU_ARCH__ == 2002))
    uint64_t statusOverflow[1] = {0};
    statusOverflow[0] = get_status();
    statusOverflow[0] = (statusOverflow[0] << 0x20) >> 0x20;
    uint64_t statusMask = 0x520;
    statusOverflow[0] = statusOverflow[0] & statusMask;
    if (statusOverflow[0] != 0) {
        uint64_t *ptr = (uint64_t *)get_imm(0x43FE0);
        uint64_t buff[0x4];
        buff[0x0] = ptr[0x0];
        buff[0x1] = ptr[0x1];
        buff[0x2] = ptr[0x2] | statusOverflow[0x0];
        buff[0x3] = ptr[0x3];
        if (buff[0x0] == 0) {
            ptr[0x0] = 0xFFFFFFFFFFFFFFFF;
            ptr[0x1] = block_idx;
        }
        ptr[0x2] = buff[0x2];

        __ubuf__ uint8_t* tmpStatus = (__ubuf__ uint8_t *)get_imm(0);
        *tmpStatus = 0;
        if (buff[0x2] > 0) {
            *tmpStatus = 0x3;
        }
        pipe_barrier(PIPE_ALL);
        copy_ubuf_to_gm(((__gm__ int32_t *)overflowStatus), ((__ubuf__ int32_t *)tmpStatus), 0, 1, 1, 0, 0);
        pipe_barrier(PIPE_ALL);
    }
#endif
}

template <typename T>
__aicore__ static inline void OOMCheckTensorListRange(__gm__ T *gmInputAddr, const int inputSize)
{
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    uint64_t ioCount = g_oomAddrArange.count;
    if (ioCount >= g_oomAddrRangeMaxSize) {
        return;
    }
    g_oomAddrArange.addr[ioCount] = reinterpret_cast<uintptr_t>(gmInputAddr);
    g_oomAddrArange.len[ioCount] = inputSize;
    g_oomAddrArange.isLevelOnePointer[ioCount] = 0;
    g_oomAddrArange.count += 1;
#endif
}

__aicore__ static inline bool OOMCheckAddrInTensorList(uint64_t index, uintptr_t gmAddrConvert,
    uintptr_t& inputOutputAddr, uint64_t& inputOutputLen)
{
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    uintptr_t gmInputAddr = g_oomAddrArange.addr[index];
    uint64_t inputSize = g_oomAddrArange.len[index] & 0xffff;
    uint64_t scaleTmp = (g_oomAddrArange.len[index] >> 16) & 0xffff;  // high 16bit is scale value
    uint64_t scaleValue = (scaleTmp == 0) ? 1 : scaleTmp;

    __gm__ uint64_t *dynamicPtr = (__gm__ uint64_t *)gmInputAddr;
    uint64_t dynamicOffset = *dynamicPtr / 8;
    uint64_t offset = 1;
    __gm__ uint64_t *dynAddr = dynamicPtr + dynamicOffset;
    while (offset < dynamicOffset) {
        dynamicPtr += 1;
        offset += 1;
        inputOutputAddr = reinterpret_cast<uintptr_t>(*dynAddr);
        dynAddr = dynAddr + 1;
        uint64_t dimCnt = *dynamicPtr;
        uint64_t dims = dimCnt & 0xFFFFFFFF;
        uint64_t tensorSize = inputSize;
        for (int i = 0; i < dims; i++) {
            dynamicPtr += 1;
            offset += 1;
            tensorSize = tensorSize * (*dynamicPtr);
        }
        inputOutputLen = tensorSize / scaleValue;
        if (gmAddrConvert >= inputOutputAddr && gmAddrConvert < inputOutputAddr + inputOutputLen)
        {
            return true;
        }
    }
#endif
    (void)index;
    (void)gmAddrConvert;
    (void)inputOutputAddr;
    (void)inputOutputLen;
    return false;
}

template <typename T>
__aicore__ static inline void OOMCheckAddrRange(__gm__ T* gmAddr, const uint64_t gmSize)
{
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    uint64_t ioCount = g_oomAddrArange.count;
    if (ioCount >= g_oomAddrRangeMaxSize) {
        return;
    }
    g_oomAddrArange.addr[ioCount] = reinterpret_cast<uintptr_t>(gmAddr);
    g_oomAddrArange.len[ioCount] = gmSize;
    g_oomAddrArange.isLevelOnePointer[ioCount] = 1;
    g_oomAddrArange.count += 1;
#endif
}

template <typename T>
__aicore__ static inline void OOMAddAddrForL2Cache(__gm__ T* gmAddr, __gm__ T* oriAddr)
{
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    uint64_t ioCount = g_oomAddrArange.count;
    if (ioCount >= g_oomAddrRangeMaxSize) {
        return;
    }

    // gmAddr: addr with l2cache offset
    if (gmAddr != oriAddr) {
        for (uint32_t i = 0; i < ioCount; i++) {
            if (g_oomAddrArange.addr[i] <= reinterpret_cast<uintptr_t>(oriAddr) &&
                reinterpret_cast<uintptr_t>(oriAddr) <
                    reinterpret_cast<uintptr_t>(g_oomAddrArange.addr[i]) + g_oomAddrArange.len[i]) {
                g_oomAddrArange.addr[ioCount] = reinterpret_cast<uintptr_t>(gmAddr);
                g_oomAddrArange.len[ioCount] =
                    g_oomAddrArange.len[i] -
                    (reinterpret_cast<uintptr_t>(oriAddr) - reinterpret_cast<uintptr_t>(g_oomAddrArange.addr[i]));
                g_oomAddrArange.isLevelOnePointer[ioCount] = 1;
                g_oomAddrArange.count += 1;
                return;
            }
        }
    }
#endif
}

__aicore__ static inline void OOMInit()
{
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    g_oomAddrArange.count = 0;
#endif
}

struct TQueConfig {
    bool nd2nz = false;
    bool nz2nd = false;
    bool scmBlockGroup = false;
    uint32_t bufferLen = 0;
    uint32_t bufferNumber = 0;
    uint32_t consumerSize = 0;
    TPosition consumer[8] = {};
    bool enableStaticEvtId = false;
    bool enableLoopQueue = false;
};

__aicore__ constexpr TQueConfig GetTQueConfig(bool nd2nzIn, bool nz2ndIn, bool scmBlockGroupIn,
    uint32_t bufferLenIn, uint32_t bufferNumberIn, uint32_t consumerSizeIn,
    const TPosition consumerIn[], bool enableStaticEvtIdIn, bool enableLoopQueueIn)
{
    return {
        .nd2nz = nd2nzIn,
        .nz2nd = nz2ndIn,
        .scmBlockGroup = scmBlockGroupIn,
        .bufferLen = bufferLenIn,
        .bufferNumber = bufferNumberIn,
        .consumerSize = consumerSizeIn,
        .consumer = {consumerIn[0], consumerIn[1], consumerIn[2], consumerIn[3],
            consumerIn[4], consumerIn[5], consumerIn[6], consumerIn[7]},
        .enableStaticEvtId = enableStaticEvtIdIn,
        .enableLoopQueue = enableLoopQueueIn
    };
}

__aicore__ constexpr TQueConfig GetTQueConfig(const int32_t mask)
{
    return {
        .nd2nz = static_cast<bool>(static_cast<uint32_t>(mask) & 0x1u),
        .nz2nd = static_cast<bool>((static_cast<uint32_t>(mask) & 0x2u) >> 1),
        .scmBlockGroup = static_cast<bool>((static_cast<uint32_t>(mask) & 0x4u) >> 2),
        .bufferLen = 0,
        .bufferNumber = 0,
        .consumerSize = 0,
        .consumer = {TPosition::MAX, TPosition::MAX, TPosition::MAX, TPosition::MAX,
            TPosition::MAX, TPosition::MAX, TPosition::MAX, TPosition::MAX},
        .enableStaticEvtId = false,
        .enableLoopQueue = false
    };
}

__aicore__ constexpr TQueConfig GetTQueConfig(const TQueConfig* conf)
{
    return {
        .nd2nz = conf->nd2nz,
        .nz2nd = conf->nz2nd,
        .scmBlockGroup = conf->scmBlockGroup,
        .bufferLen = conf->bufferLen,
        .bufferNumber = conf->bufferNumber,
        .consumerSize = conf->consumerSize,
        .consumer = {conf->consumer[0], conf->consumer[1], conf->consumer[2], conf->consumer[3],
            conf->consumer[4], conf->consumer[5], conf->consumer[6], conf->consumer[7]},
        .enableStaticEvtId = conf->enableStaticEvtId,
        .enableLoopQueue = conf->enableLoopQueue
    };
}

#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102)) || defined(__ASC_NPU_HOST__)
__aicore__ constexpr bool UseAltBufId(TPosition queDstPos, TPosition dstConsumerPos, uint32_t consumerSize)
{
    if (consumerSize <= 1) {
        return false;
    }
    return queDstPos != dstConsumerPos;
}

__aicore__ constexpr bool IdentifyPos(TQueConfig config, TPosition pos)
{
    if (config.consumerSize <= 1) {
        return true;
    }
    return config.consumer[0] == pos;
}
#endif

template <bool b> struct BoolInst {
    using Type = BoolInst<b>;
    static constexpr bool value = b;
};

using TrueType = BoolInst<true>;
using FalseType = BoolInst<false>;

template <typename T, typename U> struct IsSameType : public FalseType {};

template <typename T> struct IsSameType<T, T> : public TrueType {};

template <typename... Arg>
struct Tuple {};

template <typename T, typename U, typename... Args>
__aicore__ constexpr bool SupportType()
{
    if constexpr (sizeof...(Args) > 0) {
        return IsSameType<T, U>::value || SupportType<T, Args...>();
    }
    return IsSameType<T, U>::value;
}

template <auto T, auto U, auto... Args> __aicore__ constexpr bool SupportEnum()
{
    if constexpr (sizeof...(Args) > 0) {
        return T == U || SupportEnum<T, Args...>();
    }
    return T == U;
}

#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102)) || defined(__ASC_NPU_HOST__)
template <typename T> struct GetComplexElementType {
    using Type = T;
};

template<class T>
struct Complex {
    using EleType = typename GetComplexElementType<T>::Type;
    __simd_callee__ inline Complex() : real(0), imag(0) {}
    __simd_callee__ inline Complex(T realTmp, T imagTmp) : real(realTmp), imag(imagTmp) {}

    template<class U>
    __simd_callee__ inline Complex(const U realTmp) : real(T(realTmp)), imag(0) {}

    __simd_callee__ inline Complex(const Complex& other) : real(other.real), imag(other.imag) {}

    __simd_callee__ inline bool operator == (const Complex& val) const
    {
        return (real == val.real) && (imag == val.imag);
    }

    __simd_callee__ inline Complex& operator = (const Complex& other)
    {
        if (this == &other) {
            return *this;
        }
        real = other.real;
        imag = other.imag;
        return *this;
    }

    T real;
    T imag;
};
#endif

template <typename T, int U, int... Args> __aicore__ constexpr bool SupportBytes()
{
    if constexpr (sizeof...(Args) > 0) {
        return sizeof(T) == U || SupportBytes<T, Args...>();
    }
    return sizeof(T) == U;
}
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
template <auto funcPtr, typename... Args> __aicore__ inline void VF_CALL(Args &&... args)
{
    __VEC_SCOPE__
    {
        funcPtr(args...);
    }
}
#endif
} // namespace AscendC

#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102)) || defined(__ASC_NPU_HOST__)
using complex32 = AscendC::Complex<half>;
using complex64 = AscendC::Complex<float>;

namespace AscendC {
template <typename T> class LocalTensor;

namespace TypeUtils {
template <typename T> __aicore__ constexpr bool IsInnerDefaultType()
{
    return SupportType<T, bool, uint8_t, int8_t, half, bfloat16_t, int16_t, uint16_t, float, int32_t, uint32_t,
        fp8_e5m2_t, fp8_e4m3fn_t, fp8_e8m0_t, int64_t, uint64_t, complex32, complex64, double>();
}

template <typename T, typename U>
__aicore__ constexpr bool IsInnerDefaultType()
{
    return IsInnerDefaultType<T>() && IsInnerDefaultType<U>();
}

template <typename T> struct IsLocalTensor : public FalseType {};

template <typename T> struct IsLocalTensor<LocalTensor<T>> : public TrueType {};

template <typename T> __aicore__ constexpr bool IsLocalTensorType()
{
    if constexpr (IsLocalTensor<T>::value) {
        return SupportType<typename T::PrimType, bool, int8_t, uint8_t, int16_t, uint16_t, half, bfloat16_t, float, 
            fp8_e5m2_t, fp8_e4m3fn_t, fp8_e8m0_t, int32_t, uint32_t, int64_t, uint64_t, complex32, complex64, double>();
    } else {
        return false;
    }
}

template <typename T, typename U> __aicore__ constexpr bool IsLocalTensorType()
{
    return IsLocalTensorType<T>() && IsLocalTensorType<U>();
}
} // TypeUtils

template <auto funcPtr, typename... Args> __aicore__ inline void VF_CALL(Args &&... args)
{
#if (defined(__NPU_ARCH__) && __NPU_ARCH__ == 5102) || defined(SPLIT_CORE_VEC) || defined(ASCENDC_CPU_DEBUG)
    __VEC_SCOPE__
    {
        funcPtr(args...);
    }
#endif
}
} // namespace AscendC
#endif

#endif // ASCENDC_MODULE_UTILS_CEIL_OOM_QUE_H
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_UTILS_CEIL_OOM_QUE_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_UTILS_CEIL_OOM_QUE_H__
#endif