* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file device_error_tracking.cpp
* \brief Maps device runtime error codes to readable messages and registers the PyPTO exception callback.
*/
#include "machine/runtime/runner/device_error_tracking.h"
#include <cstdint>
#include <cstdio>
#include <iostream>
#include "adapter/api/acl_api.h"
#include "tilefwk/device_error_code.h"
namespace npu::tile_fwk {
/**
 * \brief One row of the error-code lookup table: a numeric return code paired with its description.
 */
struct ErrorCodeEntry {
    int32_t retcode;  // numeric code that a reported retcode is compared against
    const char* msg;  // C string handed back when the code matches
};
/**
 * \brief Lookup table pairing each PYPTO_DEVICE_ERROR_* code with a short human-readable description.
 *
 * GetRetcodeMessage scans this table from the first entry to the last and returns the message of
 * the first entry whose code matches, so entry order matters if two codes ever share a value.
 * NOTE(review): the PYPTO_DEVICE_ERROR_* constants are presumably declared in
 * tilefwk/device_error_code.h — confirm.
 */
static const ErrorCodeEntry kErrorCodeTable[] = {
{PYPTO_DEVICE_ERROR_PARAM_INVALID, "param invalid"},
{PYPTO_DEVICE_ERROR_INVALID_DEVICEID, "invalid device id"},
{PYPTO_DEVICE_ERROR_CONTEXT_NULL, "current context null"},
{PYPTO_DEVICE_ERROR_STREAM_CONTEXT, "stream not in current context"},
{PYPTO_DEVICE_ERROR_MODEL_CONTEXT, "model not in current context"},
{PYPTO_DEVICE_ERROR_STREAM_MODEL, "stream not in model"},
{PYPTO_DEVICE_ERROR_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"},
{PYPTO_DEVICE_ERROR_EVENT_TIMESTAMP_REVERSAL, "event timestamp reversal"},
{PYPTO_DEVICE_ERROR_ADDR_UNALIGNED, "memory address unaligned"},
{PYPTO_DEVICE_ERROR_FILE_OPEN, "open file failed"},
{PYPTO_DEVICE_ERROR_FILE_WRITE, "write file failed"},
{PYPTO_DEVICE_ERROR_STREAM_SUBSCRIBE, "error subscribe stream"},
{PYPTO_DEVICE_ERROR_THREAD_SUBSCRIBE, "error subscribe thread"},
{PYPTO_DEVICE_ERROR_GROUP_NOT_SET, "group not set"},
{PYPTO_DEVICE_ERROR_GROUP_NOT_CREATE, "group not create"},
{PYPTO_DEVICE_ERROR_STREAM_NO_CB_REG, "callback not register to stream"},
{PYPTO_DEVICE_ERROR_INVALID_MEMORY_TYPE, "invalid memory type"},
{PYPTO_DEVICE_ERROR_INVALID_HANDLE, "invalid handle"},
{PYPTO_DEVICE_ERROR_INVALID_MALLOC_TYPE, "invalid malloc type"},
{PYPTO_DEVICE_ERROR_WAIT_TIMEOUT, "wait timeout"},
{PYPTO_DEVICE_ERROR_TASK_TIMEOUT, "task timeout"},
{PYPTO_DEVICE_ERROR_SYSPARAMOPT_NOT_SET, "not set sysparamopt"},
{PYPTO_DEVICE_ERROR_DEVICE_TASK_ABORT, "device task aborting"},
{PYPTO_DEVICE_ERROR_STREAM_ABORT, "stream aborting"},
{PYPTO_DEVICE_ERROR_CAPTURE_DEPENDENCY, "capture dependency failure"},
{PYPTO_DEVICE_ERROR_STREAM_UNJOINED, "invalid capture model"},
{PYPTO_DEVICE_ERROR_MODEL_CAPTURED, "model is captured"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURED, "stream is captured"},
{PYPTO_DEVICE_ERROR_EVENT_CAPTURED, "event is captured"},
{PYPTO_DEVICE_ERROR_STREAM_NOT_CAPTURED, "stream is not in capture status"},
{PYPTO_DEVICE_ERROR_CAPTURE_MODE_NOT_SUPPORT, "stream is captured, not support current oper"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURE_IMPLICIT, "a disallowed implicit dependency from default stream"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURE_CONFLICT, "interdependent stream cannot begin capture together"},
{PYPTO_DEVICE_ERROR_STREAM_TASK_GROUP_STATUS, "task group status error"},
{PYPTO_DEVICE_ERROR_STREAM_TASK_GROUP_INTR, "task group interrupted"},
{PYPTO_DEVICE_ERROR_TASK_ABORT_STOP, "device task aborting stop before post process"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURE_UNMATCHED, "the capture was not initiated in this stream"},
{PYPTO_DEVICE_ERROR_MODEL_RUNNING, "the model is still running"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURE_WRONG_THREAD, "the thread of end capture and begin capture is not same"},
{PYPTO_DEVICE_ERROR_INSUFFICIENT_INPUT_ARRAY, "input array capacity insufficient"},
{PYPTO_DEVICE_ERROR_MODEL_UPDATE_FAILED, "the model update failed"},
{PYPTO_DEVICE_ERROR_CAPTURE_MODE_BLOCK_ASYNC, "async oper convert to sync oper, stream is captured"},
{PYPTO_DEVICE_ERROR_SYMBOL_NOT_FOUND, "symbol not found"},
{PYPTO_DEVICE_ERROR_FEATURE_NOT_SUPPORT, "feature not support"},
{PYPTO_DEVICE_ERROR_MEMORY_ALLOCATION, "memory allocation error"},
{PYPTO_DEVICE_ERROR_MEMORY_FREE, "memory free error"},
{PYPTO_DEVICE_ERROR_AICORE_OVER_FLOW, "aicore over flow"},
{PYPTO_DEVICE_ERROR_NO_DEVICE, "no device"},
{PYPTO_DEVICE_ERROR_RESOURCE_ALLOC_FAIL, "resource alloc fail"},
{PYPTO_DEVICE_ERROR_NO_PERMISSION, "no permission"},
{PYPTO_DEVICE_ERROR_NO_EVENT_RESOURCE, "no event resource"},
{PYPTO_DEVICE_ERROR_NO_STREAM_RESOURCE, "no stream resource"},
{PYPTO_DEVICE_ERROR_NO_NOTIFY_RESOURCE, "no notify resource"},
{PYPTO_DEVICE_ERROR_NO_MODEL_RESOURCE, "no model resource"},
{PYPTO_DEVICE_ERROR_NO_CDQ_RESOURCE, "no cdq resource"},
{PYPTO_DEVICE_ERROR_OVER_LIMIT, "over limit"},
{PYPTO_DEVICE_ERROR_QUEUE_EMPTY, "queue is empty"},
{PYPTO_DEVICE_ERROR_QUEUE_FULL, "queue is full"},
{PYPTO_DEVICE_ERROR_REPEATED_INIT, "repeated init"},
{PYPTO_DEVICE_ERROR_AIVEC_OVER_FLOW, "aivec over flow"},
{PYPTO_DEVICE_ERROR_OVER_FLOW, "common over flow"},
{PYPTO_DEVICE_ERROR_DEVICE_OOM, "device oom"},
{PYPTO_DEVICE_ERROR_FEATURE_NOT_SUPPORT_UPDATE_OP, "not support to update this op"},
{PYPTO_DEVICE_ERROR_TIMEOUT, "driver timeout"},
{PYPTO_DEVICE_ERROR_INTERNAL_ERROR, "runtime internal error"},
{PYPTO_DEVICE_ERROR_TS_ERROR, "ts internal error"},
{PYPTO_DEVICE_ERROR_STREAM_TASK_FULL, "task full in stream"},
{PYPTO_DEVICE_ERROR_STREAM_TASK_EMPTY, "task empty in stream"},
{PYPTO_DEVICE_ERROR_STREAM_NOT_COMPLETE, "stream not complete"},
{PYPTO_DEVICE_ERROR_END_OF_SEQUENCE, "end of sequence"},
{PYPTO_DEVICE_ERROR_EVENT_NOT_COMPLETE, "event not complete"},
{PYPTO_DEVICE_ERROR_CONTEXT_RELEASE_ERROR, "context release error"},
{PYPTO_DEVICE_ERROR_SOC_VERSION, "soc version error"},
{PYPTO_DEVICE_ERROR_TASK_TYPE_NOT_SUPPORT, "task type not support"},
{PYPTO_DEVICE_ERROR_LOST_HEARTBEAT, "ts lost heartbeat"},
{PYPTO_DEVICE_ERROR_MODEL_EXECUTE, "model execute failed"},
{PYPTO_DEVICE_ERROR_REPORT_TIMEOUT, "report timeout"},
{PYPTO_DEVICE_ERROR_SYS_DMA, "sys dma error"},
{PYPTO_DEVICE_ERROR_AICORE_TIMEOUT, "aicore timeout"},
{PYPTO_DEVICE_ERROR_AICORE_EXCEPTION, "aicore exception"},
{PYPTO_DEVICE_ERROR_AICORE_TRAP_EXCEPTION, "aicore trap exception"},
{PYPTO_DEVICE_ERROR_AICPU_TIMEOUT, "aicpu timeout"},
{PYPTO_DEVICE_ERROR_AICPU_EXCEPTION, "aicpu exception"},
{PYPTO_DEVICE_ERROR_AICPU_DATADUMP_RSP_ERR, "aicpu datadump response error"},
{PYPTO_DEVICE_ERROR_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"},
{PYPTO_DEVICE_ERROR_PROFILING_ERROR, "profiling error"},
{PYPTO_DEVICE_ERROR_IPC_ERROR, "ipc error"},
{PYPTO_DEVICE_ERROR_MODEL_ABORT_NORMAL, "model abort normal"},
{PYPTO_DEVICE_ERROR_KERNEL_UNREGISTERING, "kernel unregistering"},
{PYPTO_DEVICE_ERROR_RINGBUFFER_NOT_INIT, "ringbuffer not init"},
{PYPTO_DEVICE_ERROR_RINGBUFFER_NO_DATA, "ringbuffer no data"},
{PYPTO_DEVICE_ERROR_KERNEL_LOOKUP, "kernel lookup error"},
{PYPTO_DEVICE_ERROR_KERNEL_DUPLICATE, "kernel register duplicate"},
{PYPTO_DEVICE_ERROR_DEBUG_REGISTER_FAIL, "debug register failed"},
{PYPTO_DEVICE_ERROR_DEBUG_UNREGISTER_FAIL, "debug unregister failed"},
{PYPTO_DEVICE_ERROR_LABEL_CONTEXT, "label not in current context"},
{PYPTO_DEVICE_ERROR_PROGRAM_USE_OUT, "program register num use out"},
{PYPTO_DEVICE_ERROR_DEV_SETUP_ERROR, "device setup error"},
{PYPTO_DEVICE_ERROR_VECTOR_CORE_TIMEOUT, "vector core timeout"},
{PYPTO_DEVICE_ERROR_VECTOR_CORE_EXCEPTION, "vector core exception"},
{PYPTO_DEVICE_ERROR_VECTOR_CORE_TRAP_EXCEPTION, "vector core trap exception"},
{PYPTO_DEVICE_ERROR_CDQ_BATCH_ABNORMAL, "cdq alloc batch abnormal"},
{PYPTO_DEVICE_ERROR_DIE_MODE_CHANGE_ERROR, "can not change die mode"},
{PYPTO_DEVICE_ERROR_DIE_SET_ERROR, "single die mode can not set die"},
{PYPTO_DEVICE_ERROR_INVALID_DIEID, "invalid die id"},
{PYPTO_DEVICE_ERROR_DIE_MODE_NOT_SET, "die mode not set"},
{PYPTO_DEVICE_ERROR_AICORE_TRAP_READ_OVERFLOW, "aic trap read overflow"},
{PYPTO_DEVICE_ERROR_AICORE_TRAP_WRITE_OVERFLOW, "aic trap write overflow"},
{PYPTO_DEVICE_ERROR_VECTOR_CORE_TRAP_READ_OVERFLOW, "aiv trap read overflow"},
{PYPTO_DEVICE_ERROR_VECTOR_CORE_TRAP_WRITE_OVERFLOW, "aiv trap write overflow"},
{PYPTO_DEVICE_ERROR_STREAM_SYNC_TIMEOUT, "stream sync time out"},
{PYPTO_DEVICE_ERROR_EVENT_SYNC_TIMEOUT, "event sync time out"},
{PYPTO_DEVICE_ERROR_FFTS_PLUS_TIMEOUT, "ffts+ timeout"},
{PYPTO_DEVICE_ERROR_FFTS_PLUS_EXCEPTION, "ffts+ exception"},
{PYPTO_DEVICE_ERROR_FFTS_PLUS_TRAP_EXCEPTION, "ffts+ trap exception"},
{PYPTO_DEVICE_ERROR_SEND_MSG, "hdc send msg fail"},
{PYPTO_DEVICE_ERROR_COPY_DATA, "copy data fail"},
{PYPTO_DEVICE_ERROR_DEVICE_MEM_ERROR, "device MEM ERROR"},
{PYPTO_DEVICE_ERROR_HBM_MULTI_BIT_ECC_ERROR, "hbm Multi-bit ECC error"},
{PYPTO_DEVICE_ERROR_SUSPECT_DEVICE_MEM_ERROR, "suspect device MEM ERROR"},
{PYPTO_DEVICE_ERROR_LINK_ERROR, "link ERROR"},
{PYPTO_DEVICE_ERROR_SUSPECT_REMOTE_ERROR, "suspect remote ERROR"},
{PYPTO_DEVICE_ERROR_DRV_INTERNAL_ERROR, "drv internal error"},
{PYPTO_DEVICE_ERROR_AICPU_INTERNAL_ERROR, "aicpu internal error"},
{PYPTO_DEVICE_ERROR_SOCKET_CLOSE, "hdc disconnect"},
{PYPTO_DEVICE_ERROR_AICPU_INFO_LOAD_RSP_ERR, "aicpu info load response error"},
{PYPTO_DEVICE_ERROR_STREAM_CAPTURE_INVALIDATED, "capture status is invalidated"},
{PYPTO_DEVICE_ERROR_COMM_OP_RETRY_FAIL, "hccl operation retry failed"},
};
const char* GetRetcodeMessage(int32_t retcode)
{
for (const auto& entry : kErrorCodeTable) {
if (entry.retcode == retcode) {
return entry.msg;
}
}
return "unknown error";
}
/**
 * \brief Exception callback that prints a summary of a reported device exception to stdout.
 *
 * Prints the message mapped from the retcode by GetRetcodeMessage together with the device id,
 * stream id, task id, raw retcode, and the kernel name (only taken from the expanded info when its
 * type is AICORE; otherwise "(Null)"). Two fixed hint lines pointing at the ascend log follow.
 *
 * \param exceptionInfo Exception description handed to the callback. A null pointer is tolerated:
 *                      a short notice replaces the detail line and the fixed hint lines are
 *                      still printed.
 */
void PyPTOExceptionInfoCallBack(AclRtExceptionInfo* exceptionInfo)
{
    if (exceptionInfo == nullptr) {
        // Nothing to dereference; still emit the generic hints below.
        printf("[Error]: device exception reported without exception info\n");
    } else {
        const char* errMsg = GetRetcodeMessage(static_cast<int32_t>(exceptionInfo->retcode));
        const char* kernelName = "(Null)";
        if (exceptionInfo->expandInfo.type == RtExceptionExpandType::AICORE) {
            // Copy into a local pointer first so the null guard works whether the field is a
            // pointer or an array; passing a null pointer to "%s" is undefined behaviour.
            const char* reportedName =
                exceptionInfo->expandInfo.u.aicoreInfo.exceptionArgs.exceptionKernelInfo.kernelName;
            if (reportedName != nullptr) {
                kernelName = reportedName;
            }
        }
        printf("[Error]: %s, device_id: %u, stream_id: %u, task_id: %u, retcode: %u, kernelName: %s\n",
               errMsg, exceptionInfo->deviceid, exceptionInfo->streamid, exceptionInfo->taskid,
               exceptionInfo->retcode, kernelName);
    }
    printf(" Rectify the fault based on the error information in the ascend log.\n");
    printf("PyPTO error: PyPTO Inner Error. Please rectify the fault based on the error information "
           "in the ascend log. (function PyPTOExceptionInfoCallBack)\n");
}
void InitializeErrorCallback()
{
AclError ret = AclRtSetExceptionInfoCallback(&PyPTOExceptionInfoCallBack);
if (ret != ACLRT_SUCCESS) {
printf("Failed to set exception callback: %d\n", ret);
}
}
}