* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file asc_debug_utils.h
* \brief
*/
#ifndef IMPL_UTILS_DEBUG_ASC_DEBUG_UTILS_H
#define IMPL_UTILS_DEBUG_ASC_DEBUG_UTILS_H
inline __gm__ uint8_t* __gm__ g_sysPrintFifoSpace = nullptr;
#ifndef ASCENDC_CPU_DEBUG
#include <type_traits>
#include "impl/utils/sys_macros.h"
#include "impl/utils/debug/asc_utils_types.h"
#include "impl/utils/debug/asc_debug_types.h"
#if __NPU_ARCH__ == 2002
#include "impl/utils/debug/npu_arch_2002/asc_debug_utils_impl.h"
#elif __NPU_ARCH__ == 2201
#include "impl/utils/debug/npu_arch_2201/asc_debug_utils_impl.h"
#elif __NPU_ARCH__ == 3510
#include "impl/utils/debug/npu_arch_3510/asc_debug_utils_impl.h"
#elif __NPU_ARCH__ == 5102
#include "impl/utils/debug/npu_arch_5102/asc_debug_utils_impl.h"
#else
#include "impl/utils/debug/asc_debug_utils_impl_stub.h"
#endif
namespace __asc_aicore {
__aicore__ inline void asc_copy_ub2gm_align(__gm__ void* dst, __ubuf__ void* src, uint32_t size)
{
copy_ubuf_to_gm_align_v2(dst, src, 1, size, 0, 0, 0, 0);
}
__aicore__ inline void enable_asc_diagnostics()
{
#if (!defined(ASCENDC_DUMP) || (ASCENDC_DUMP != 0)) || defined(ASCENDC_TIME_STAMP_ON)
static const struct AscTlv __asc_debug_meta_section__ __attribute__ ((used, section (".ascend.meta"))) =
{4, 4, 1};
#endif
}
__aicore__ inline void enable_asc_assert()
{
#if (!defined(ASCENDC_DUMP) || (ASCENDC_DUMP != 0)) || defined(ASCENDC_TIME_STAMP_ON)
#if (__NPU_ARCH__ == 5102)
static const struct AscTlv __asc_assert_meta_section__ __attribute__ ((used, section (".ascend.meta"))) =
{4, 4, 1};
#else
static const struct AscTlv __asc_assert_meta_section__ __attribute__ ((used, section (".ascend.meta"))) =
{4, 4, 5};
#endif
#endif
}
template <typename T, typename U, typename... Args>
__aicore__ constexpr bool is_same_in_list()
{
if constexpr (sizeof...(Args) > 0) {
return std::is_same<T, U>::value || is_same_in_list<T, Args...>();
}
return std::is_same<T, U>::value;
}
__aicore__ inline void asc_entire_dcci(__gm__ uint64_t* ptr)
{
asc_entire_dcci_impl(ptr);
}
__aicore__ inline uint64_t asc_debug_get_system_cycle()
{
return asc_debug_get_system_cycle_impl();
}
__aicore__ inline uint32_t asc_debug_get_core_idx()
{
return asc_debug_get_core_idx_impl();
}
__aicore__ inline uint64_t asc_debug_get_block_idx()
{
return asc_debug_get_block_idx_impl();
}
__aicore__ inline uint64_t asc_debug_get_block_num()
{
return get_block_num();
}
__aicore__ inline uint64_t asc_debug_get_block_total_num()
{
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3510)
return asc_debug_get_block_num() * get_task_ration();
#else
return asc_debug_get_block_num();
#endif
}
__aicore__ inline int64_t asc_debug_get_program_counter()
{
return get_pc();
}
template <uint64_t timeoutCycle = 15 * 1000 * 1000>
__aicore__ inline void ringbuf_wait_rts_sync()
{
return ringbuf_wait_rts_sync_impl<timeoutCycle>();
}
__aicore__ inline uint32_t get_cstring_len(__gm__ const char* s)
{
uint32_t i = 0;
while (*(s + i) != '\0') {
i++;
}
return i + 1;
}
__aicore__ constexpr inline uint32_t div_ceil(uint32_t a, uint32_t b)
{
if (b == 0) {
return 0;
}
return (a + b - 1) / b;
}
__aicore__ constexpr inline uint32_t align_up(uint32_t a, uint32_t b)
{
return div_ceil(a, b) * b;
}
__aicore__ inline void sync_all()
{
sync_all_impl();
}
__aicore__ inline uint32_t mem_copy_gm_to_gm(__gm__ uint8_t* dst, __gm__ const uint8_t* src, const uint32_t& len)
{
if (dst == nullptr || src == nullptr)
{
return 1;
}
for (uint32_t i = 0; i < len; i++) {
*(dst + i) = *(src + i);
}
asc_entire_dcci((__gm__ uint64_t*)dst);
return 0;
}
__aicore__ __gm__ inline DebugBlockHeadInfo* get_block_info()
{
uint32_t blockIdx = asc_debug_get_core_idx();
uint32_t blockLength = reinterpret_cast<__gm__ DebugBlockHeadInfo*>(g_sysPrintFifoSpace)->length;
__gm__ DebugBlockHeadInfo* blockInfo =
reinterpret_cast<__gm__ DebugBlockHeadInfo*>(g_sysPrintFifoSpace + blockLength * blockIdx);
return blockInfo->magic == 0xAE86 ? blockInfo : nullptr;
}
__aicore__ __gm__ inline DebugBlockReadInfo* get_block_read_info(__gm__ DebugBlockHeadInfo* blockInfo)
{
__gm__ uint8_t* blockHeadAddr = reinterpret_cast<__gm__ uint8_t*>(blockInfo);
return reinterpret_cast<__gm__ DebugBlockReadInfo*>(blockHeadAddr + sizeof(DebugBlockHeadInfo));
}
__aicore__ __gm__ inline DebugBlockWriteInfo* get_block_write_info(__gm__ DebugBlockHeadInfo* blockInfo)
{
__gm__ uint8_t* ringBufAddr = reinterpret_cast<__gm__ uint8_t*>(blockInfo->ringBufAddr);
return reinterpret_cast<__gm__ DebugBlockWriteInfo*>(ringBufAddr + blockInfo->ringBufLen);
}
__aicore__ __gm__ inline uint8_t* get_ringbuf_tlv_addr(__gm__ DebugBlockHeadInfo* blockInfo)
{
__gm__ DebugBlockWriteInfo* writeInfo = get_block_write_info(blockInfo);
__gm__ uint8_t* ringBufAddr = reinterpret_cast<__gm__ uint8_t*>(blockInfo->ringBufAddr);
return ringBufAddr + writeInfo->bufOffset;
}
__aicore__ inline void update_write_info(__gm__ DebugBlockWriteInfo* writeInfo, const uint32_t& tlvLen)
{
writeInfo->bufOffset += tlvLen;
writeInfo->packIdx += 1;
asc_entire_dcci((__gm__ uint64_t*)writeInfo);
}
__aicore__ inline bool ringbuf_read_wait(__gm__ DebugBlockReadInfo* readInfo)
{
constexpr uint32_t maxCounter = 15;
uint32_t counter = 0;
while (readInfo->bufOffset == 0) {
if (counter >= maxCounter) {
return false;
}
ringbuf_wait_rts_sync();
++counter;
asc_entire_dcci((__gm__ uint64_t*)readInfo);
}
return true;
}
__aicore__ inline void ringbuf_skip_directly(__gm__ DebugBlockWriteInfo* writeInfo)
{
writeInfo->bufOffset = 0;
asc_entire_dcci((__gm__ uint64_t*)writeInfo);
}
__aicore__ inline void ringbuf_skip_with_info(
__gm__ DebugBlockWriteInfo* writeInfo, __gm__ uint8_t* ringBufAddr, const uint32_t& ringBufLen)
{
__gm__ SkipTlv* skipInfo = reinterpret_cast<__gm__ SkipTlv*>(ringBufAddr + writeInfo->bufOffset);
skipInfo->type = static_cast<uint32_t>(DumpType::DUMP_SKIP);
skipInfo->length = ringBufLen - writeInfo->bufOffset - sizeof(SkipTlv);
writeInfo->bufOffset = 0;
writeInfo->packIdx += 1;
asc_entire_dcci((__gm__ uint64_t*)skipInfo);
asc_entire_dcci((__gm__ uint64_t*)writeInfo);
return;
}
__aicore__ inline bool ringbuf_overflow_wait(__gm__ DebugBlockReadInfo* readInfo, __gm__ DebugBlockWriteInfo* writeInfo,
const uint32_t& tlvLen)
{
constexpr uint32_t maxCounter = 15;
uint32_t counter = 0;
while (writeInfo->bufOffset < readInfo->bufOffset && writeInfo->bufOffset + tlvLen >= readInfo->bufOffset) {
if (counter >= maxCounter) {
return false;
}
ringbuf_wait_rts_sync();
++counter;
asc_entire_dcci((__gm__ uint64_t*)readInfo);
}
return true;
}
__aicore__ inline bool check_ringbuf_space(__gm__ DebugBlockHeadInfo* blockInfo, const uint32_t& tlvLen)
{
constexpr uint32_t minTlvLen = sizeof(SkipTlv);
__gm__ uint8_t* ringBufAddr = reinterpret_cast<__gm__ uint8_t*>(blockInfo->ringBufAddr);
uint32_t ringBufLen = blockInfo->ringBufLen;
__gm__ DebugBlockReadInfo* readInfo = get_block_read_info(blockInfo);
__gm__ DebugBlockWriteInfo* writeInfo = get_block_write_info(blockInfo);
if (minTlvLen >= ringBufLen || tlvLen > ringBufLen) {
return false;
} else if (writeInfo->bufOffset + minTlvLen >= ringBufLen) {
if (!ringbuf_read_wait(readInfo)) {
return false;
}
ringbuf_skip_directly(writeInfo);
} else if (writeInfo->bufOffset + tlvLen > ringBufLen) {
if (!ringbuf_read_wait(readInfo)) {
return false;
}
ringbuf_skip_with_info(writeInfo, ringBufAddr, ringBufLen);
}
if (writeInfo->packIdx > 0 &&
writeInfo->bufOffset < readInfo->bufOffset &&
writeInfo->bufOffset + tlvLen >= readInfo->bufOffset) {
return ringbuf_overflow_wait(readInfo, writeInfo, tlvLen);
}
return true;
}
template <typename T>
__aicore__ constexpr inline DumpTensorDataType get_dump_datatype_impl();
template <typename T>
__aicore__ constexpr inline DumpTensorDataType get_dump_datatype()
{
return get_dump_datatype_impl<T>();
}
__aicore__ inline void asc_debug_get_cann_vserion(__gm__ char*& versionStr, uint64_t& version, uint64_t& timeStamp)
{
#ifdef ASC_DEVKIT_VERSION_STR
versionStr = const_cast<__gm__ char*>(ASC_DEVKIT_VERSION_STR);
#else
versionStr = const_cast<__gm__ char*>("Unknown ASC_DEVKIT version");
#endif
#ifdef ASC_DEVKIT_TIMESTAMP
timeStamp = static_cast<uint64_t>(ASC_DEVKIT_TIMESTAMP);
#else
timeStamp = 0;
#endif
#ifdef ASC_DEVKIT_VERSION
version = static_cast<uint64_t>(ASC_DEVKIT_VERSION);
#else
version = 0;
#endif
}
}
#endif
__aicore__ static __attribute__((noinline)) void AscVFDebugInitUb()
{
#if !defined(ASCENDC_CPU_DEBUG) && defined(__NPU_ARCH__) && __NPU_ARCH__ == 3510
if (g_sysPrintFifoSpace != nullptr) {
constexpr uint32_t RESERVED_UB_SIZE = 8 * 1024;
uint64_t ascReservedAddr = get_shmem_sz() - RESERVED_UB_SIZE;
uint16_t blockIdx = asc_debug_get_block_idx();
get_printf_ubuf_addr_aicore(ascReservedAddr, blockIdx);
}
#endif
}
__aicore__ static __attribute__((noinline)) void AscVFDebugTransferUb()
{
#if !defined(ASCENDC_CPU_DEBUG) && defined(__NPU_ARCH__) && __NPU_ARCH__ == 3510
if (g_sysPrintFifoSpace != nullptr) {
pipe_barrier(PIPE_ALL);
asc_vf_debug_ub2gm();
}
#endif
}
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_ASC_DEBUG_UTILS__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_ASC_DEBUG_UTILS__
#endif
#endif