* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file sk_dfx_exception_handler.cpp
* \brief SuperKernel DFX, handle sk aicore error
*/
#include <cstdarg>
#include <cstring>
#include <set>
#include <map>
#include "sk_dfx_exception_handler.h"
#include "sk_log.h"
#include "sk_types.h"
#include "sk_common.h"
#include "sk_event_recorder.h"
#include "runtime/kernel.h"
SuperKernelExceptionHandler::SuperKernelExceptionHandler()
: aicoreNums(GetSkRuntimeConfig().eventCoreNum)
, skDeviceEntryArgsDev(nullptr)
, skDeviceEntryArgsPtrLen(0)
, skDeviceEntryArgsHost(nullptr)
, skHeaderInfoHost(nullptr)
, aicTaskQueDevPtr(nullptr)
, aivTaskQueDevPtr(nullptr)
, aicTaskCnt(0)
, aivTaskCnt(0)
, hasOpTrace_(false)
, errorNodeIdx_(-1)
{
}
void SuperKernelExceptionHandler::HandleException(aclrtExceptionInfo *exceptionInfo) {
if (exceptionInfo == nullptr) {
SK_LOGE("Exception info is null");
return;
}
if (!IsSuperKernelException(exceptionInfo)) {
SK_LOGD("Exception is not from sk_entry, skip handling.");
return;
}
SK_LOGD("Aclgraph superkernel aicore exception occurred, callback function.");
if (!ExtractSkEntryArgs(exceptionInfo)) {
FreeResources();
return;
}
if (!ExtractTaskQueue()) {
FreeResources();
return;
}
ExtractAndPrintSkInfo();
if (!ParseAndPrintSubKernelSymbols(exceptionInfo)) {
FreeResources();
return;
}
SkEventRecorder::Instance().PrintModelIdIndexMap();
PrintAllCoreSymbols();
FreeResources();
}
bool SuperKernelExceptionHandler::ExtractSkEntryArgs(aclrtExceptionInfo *exceptionInfo) {
if (!ExtractSkDeviceEntryArgsPtr(exceptionInfo)) {
return false;
}
if (!CopySkDeviceEntryArgsToHost()) {
return false;
}
return true;
}
bool SuperKernelExceptionHandler::CopySkDeviceEntryArgsToHost() {
if (!ExtractSkHeaderInfo()) {
return false;
}
SK_LOGI("---Total SkDeviceEntryArgs size: %lu bytes", skHeaderInfoHost->totalSize);
if (CheckError(aclrtMallocHost((void **)(&skDeviceEntryArgsHost), skHeaderInfoHost->totalSize),
"aclrtMallocHost for skDeviceEntryArgsHost") != ACL_SUCCESS) {
aclrtFreeHost(skHeaderInfoHost);
skHeaderInfoHost = nullptr;
return false;
}
if (CheckError(aclrtMemcpy(skDeviceEntryArgsHost, skHeaderInfoHost->totalSize,
skDeviceEntryArgsDev, skHeaderInfoHost->totalSize,
ACL_MEMCPY_DEVICE_TO_HOST),
"aclrtMemcpy for skDeviceEntryArgs") != ACL_SUCCESS) {
aclrtFreeHost(skHeaderInfoHost);
skHeaderInfoHost = nullptr;
return false;
}
if (skHeaderInfoHost != nullptr) {
aclrtFreeHost(skHeaderInfoHost);
skHeaderInfoHost = nullptr;
}
skHeaderInfoHost = &(skDeviceEntryArgsHost->skHeader);
return true;
}
bool SuperKernelExceptionHandler::ExtractSkDeviceEntryArgsPtr(aclrtExceptionInfo *exceptionInfo) {
auto ret = aclrtGetArgsFromExceptionInfo(exceptionInfo, &skDeviceEntryArgsDev, &skDeviceEntryArgsPtrLen);
SK_LOGI("---skDeviceEntryArgsDev: %p", skDeviceEntryArgsDev);
SK_LOGI("---skDeviceEntryArgsPtrLen: %d", skDeviceEntryArgsPtrLen);
if (skDeviceEntryArgsPtrLen < 8) {
SK_LOGI("no args, callback return");
return false;
}
return true;
}
bool SuperKernelExceptionHandler::ExtractSkHeaderInfo() {
SkHeaderInfo *tempHeader = nullptr;
if (CheckError(aclrtMallocHost((void **)(&tempHeader), sizeof(SkHeaderInfo)),
"aclrtMallocHost for temp SkHeaderInfo") != ACL_SUCCESS) {
return false;
}
if (CheckError(aclrtMemcpy(tempHeader, sizeof(SkHeaderInfo),
skDeviceEntryArgsDev, sizeof(SkHeaderInfo),
ACL_MEMCPY_DEVICE_TO_HOST),
"aclrtMemcpy for SkHeaderInfo") != ACL_SUCCESS) {
aclrtFreeHost(tempHeader);
return false;
}
skHeaderInfoHost = tempHeader;
return true;
}
bool SuperKernelExceptionHandler::ExtractTaskQueue() {
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
if (skHeaderInfoHost->aicQueOffset > 0) {
TaskQue *aicTaskQue = reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aicQueOffset);
aicTaskCnt = aicTaskQue->taskCnt;
for (uint32_t i = 0; i < aicTaskCnt; ++i) {
if (aicTaskQue->taskInfos[i].type == SkTaskType::TYPE_FUNC) {
funcNodeIndices_.insert(aicTaskQue->taskInfos[i].index);
}
}
}
if (skHeaderInfoHost->aivQueOffset > 0) {
TaskQue *aivTaskQue = reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aivQueOffset);
aivTaskCnt = aivTaskQue->taskCnt;
for (uint32_t i = 0; i < aivTaskCnt; ++i) {
if (aivTaskQue->taskInfos[i].type == SkTaskType::TYPE_FUNC) {
funcNodeIndices_.insert(aivTaskQue->taskInfos[i].index);
}
}
}
return true;
}
void SuperKernelExceptionHandler::PrintSkHeaderInfo() const {
SK_LOGI("=== SkHeaderInfo ===");
SK_LOGI("aicQueOffset: %u", skHeaderInfoHost->aicQueOffset);
SK_LOGI("aivQueOffset: %u", skHeaderInfoHost->aivQueOffset);
SK_LOGI("counterOffset: %u", skHeaderInfoHost->counterOffset);
SK_LOGI("dfxOffset: %u", skHeaderInfoHost->dfxOffset);
SK_LOGI("nodeCnt: %u", skHeaderInfoHost->nodeCnt);
SK_LOGI("totalSize: %lu", skHeaderInfoHost->totalSize);
uint16_t modelIdIdx = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 32) & 0xFFFF);
uint16_t skScopeId = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 16) & 0xFFFF);
std::string modelId = SkEventRecorder::Instance().GetModelIdByIndex(modelIdIdx);
SK_LOGI("modelIdIndexAndSkScopeId: 0x%lx (modelIdIdx=%u, skScopeId=%u, modelId=%s)",
skHeaderInfoHost->modelIdIndexAndSkScopeId, modelIdIdx, skScopeId, modelId.c_str());
}
void SuperKernelExceptionHandler::ExtractAndPrintSkInfo() {
PrintSkHeaderInfo();
PrintTaskQueue();
PrintCounterInfo();
PrintDfxInfo();
}
void SuperKernelExceptionHandler::PrintTaskQueue() const {
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
if (skHeaderInfoHost->aicQueOffset > 0) {
TaskQue *aicTaskQue = reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aicQueOffset);
SK_LOGI("=== AIC TaskQue (offset=%u) ===", skHeaderInfoHost->aicQueOffset);
SK_LOGI("taskCnt: %u", aicTaskCnt);
for (uint32_t i = 0; i < aicTaskCnt; ++i) {
const TaskInfo &task = aicTaskQue->taskInfos[i];
SK_LOGI(" [%u] index=%u, type=%s, numBlocks=%u, args=0x%lx",
i, task.index, to_string(task.type), task.numBlocks, task.args);
}
}
if (skHeaderInfoHost->aivQueOffset > 0) {
TaskQue *aivTaskQue = reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aivQueOffset);
SK_LOGI("=== AIV TaskQue (offset=%u) ===", skHeaderInfoHost->aivQueOffset);
SK_LOGI("taskCnt: %u", aivTaskCnt);
for (uint32_t i = 0; i < aivTaskCnt; ++i) {
const TaskInfo &task = aivTaskQue->taskInfos[i];
SK_LOGI(" [%u] index=%u, type=%s, numBlocks=%u, args=0x%lx",
i, task.index, to_string(task.type), task.numBlocks, task.args);
}
}
}
void SuperKernelExceptionHandler::PrintCounterInfo() const {
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
if (skHeaderInfoHost->counterOffset > 0) {
SkCounterInfo *counterInfo = reinterpret_cast<SkCounterInfo*>(dataBase + skHeaderInfoHost->counterOffset);
SK_LOGI("=== SkCounterInfo (offset=%u) ===", skHeaderInfoHost->counterOffset);
for (uint32_t i = 0; i < aicoreNums; ++i) {
SK_LOGI(" [core %u] index=%u, opState=%u",
i, counterInfo[i].index, counterInfo[i].opState);
}
}
}
void SuperKernelExceptionHandler::PrintDfxInfo() const {
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
if (skHeaderInfoHost->dfxOffset > 0) {
SkDfxInfo *dfxInfo = reinterpret_cast<SkDfxInfo*>(dataBase + skHeaderInfoHost->dfxOffset);
SK_LOGE("=== SkDfxInfo (offset=%u, nodeCnt=%u) ===",
skHeaderInfoHost->dfxOffset, skHeaderInfoHost->nodeCnt);
for (uint32_t i = 0; i < skHeaderInfoHost->nodeCnt; ++i) {
SK_LOGE(" [node %u] binHdl=0x%lx, funcHdlOri=0x%lx, aicSize=0x%x, aivSize=0x%x, numBlocks=%u, cubeNum=%u, vecNum=%u",
i, dfxInfo[i].binHdl, dfxInfo[i].funcHdlOri,
dfxInfo[i].aicSize, dfxInfo[i].aivSize,
dfxInfo[i].numBlocks, dfxInfo[i].cubeNum, dfxInfo[i].vecNum);
if (funcNodeIndices_.count(i) > 0) {
aclrtFuncHandle funcHdl = reinterpret_cast<aclrtFuncHandle>(dfxInfo[i].funcHdlOri);
char funcName[256] = {0};
aclError ret = aclrtGetFunctionName(funcHdl, sizeof(funcName), funcName);
if (ret == ACL_SUCCESS) {
SK_LOGE(" Origin function name: %s", funcName);
} else {
SK_LOGE(" Failed to get origin function name for node[%u], ret=%d", i, ret);
}
} else {
SK_LOGE(" Node[%u] is not a FUNC type", i);
}
for (int j = 0; j < 4; ++j) {
if (dfxInfo[i].entryAic[j] != 0) {
SK_LOGE(" entryAic[%d]=0x%lx", j, dfxInfo[i].entryAic[j]);
}
if (dfxInfo[i].entryAiv[j] != 0) {
SK_LOGE(" entryAiv[%d]=0x%lx", j, dfxInfo[i].entryAiv[j]);
}
}
}
}
}
bool SuperKernelExceptionHandler::GetExceptionRegInfo(const aclrtExceptionInfo &exception,
ExceptionRegInfo &exceptionRegInfo) {
rtError_t rtRet = rtGetExceptionRegInfo(&exception, &exceptionRegInfo.errRegInfo, &exceptionRegInfo.coreNum);
if (rtRet != 0) {
SK_LOGE("Call rtGetExceptionRegInfo for error register information failed. ret: %d", static_cast<int32_t>(rtRet));
return false;
}
SK_LOGI("Get error register information. coreNum=%u", exceptionRegInfo.coreNum);
return true;
}
uint64_t SuperKernelExceptionHandler::GetCondRegValue(const rtExceptionErrRegInfo_t &coreErrRegInfo) {
constexpr uint32_t COND_REG_LOW_IDX = 20;
constexpr uint32_t COND_REG_HIGH_IDX = 21;
uint64_t condValue = (static_cast<uint64_t>(coreErrRegInfo.errReg[COND_REG_HIGH_IDX]) << 32)
| static_cast<uint64_t>(coreErrRegInfo.errReg[COND_REG_LOW_IDX]);
return condValue;
}
void SuperKernelExceptionHandler::ParseAndPrintCondInfo(uint32_t coreId, rtCoreType_t coreType, uint64_t condValue) {
if (condValue == 0) {
SK_LOGE("[Core %u] Failed to get COND register value, possible reason: driver package not upgraded", coreId);
return;
}
uint8_t opState = static_cast<uint8_t>(condValue & 0xFF);
uint32_t opIndex = static_cast<uint32_t>((condValue >> 8) & 0xFF);
const char *coreTypeName = (coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV";
SK_LOGE("[Core %u] %s COND register value: 0x%lx (opState=%u, opIndex=%u)",
coreId, coreTypeName, condValue, opState, opIndex);
PrintCondSubKernelInfo(coreId, condValue);
}
void SuperKernelExceptionHandler::PrintCondSubKernelInfo(uint32_t coreId, uint64_t condValue) {
uint8_t opState = static_cast<uint8_t>(condValue & 0xFF);
uint32_t opIndex = static_cast<uint32_t>((condValue >> 8) & 0xFF);
if (opState == static_cast<uint8_t>(SkOpTraceType::ORIGIN)) {
SK_LOGE("[Core %u] COND: No SK entry executed yet (opState=ORIGIN).", coreId);
} else if (opState == static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_LAUNCHED)) {
SK_LOGE("[Core %u] COND: SK entry launched, no sub-kernel executed yet. Next opIndex=%u", coreId, opIndex);
if (opIndex < skHeaderInfoHost->nodeCnt) {
KernelFuncName kernelFuncName = GetOrLoadKernelSymbols(opIndex);
if (!kernelFuncName.name.empty()) {
SK_LOGE("[Core %u] COND: Next sub-kernel function name: %s", coreId, kernelFuncName.name.c_str());
}
}
} else if (opState == static_cast<uint8_t>(SkOpTraceType::OP_LAUNCHED)) {
SK_LOGE("[Core %u] COND: Currently running sub-kernel opIndex=%u", coreId, opIndex);
if (opIndex < skHeaderInfoHost->nodeCnt) {
KernelFuncName kernelFuncName = GetOrLoadKernelSymbols(opIndex);
if (!kernelFuncName.name.empty()) {
SK_LOGE("[Core %u] COND: Current sub-kernel function name: %s", coreId, kernelFuncName.name.c_str());
}
}
} else if (opState == static_cast<uint8_t>(SkOpTraceType::OP_FINISHED)) {
SK_LOGE("[Core %u] COND: Sub-kernel opIndex=%u finished, about to run next.", coreId, opIndex);
if (opIndex < skHeaderInfoHost->nodeCnt) {
KernelFuncName currentKernelFuncName = GetOrLoadKernelSymbols(opIndex);
if (!currentKernelFuncName.name.empty()) {
SK_LOGE("[Core %u] COND: Last finished sub-kernel function name: %s", coreId, currentKernelFuncName.name.c_str());
}
}
if (opIndex + 1 < skHeaderInfoHost->nodeCnt) {
KernelFuncName nextKernelFuncName = GetOrLoadKernelSymbols(opIndex + 1);
if (!nextKernelFuncName.name.empty()) {
SK_LOGE("[Core %u] COND: Next sub-kernel function name: %s", coreId, nextKernelFuncName.name.c_str());
}
}
} else if (opState == static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_FINISHED)) {
SK_LOGE("[Core %u] COND: SK entry execution completed.", coreId);
} else {
SK_LOGE("[Core %u] COND: Unknown opState=%u, opIndex=%u", coreId, opState, opIndex);
}
}
void SuperKernelExceptionHandler::PrintSymbolByCoreId(uint32_t coreId, rtCoreType_t coreType,
uint64_t startPC, uint64_t currentPC,
const KernelFuncName &kernelFuncName) {
const char *coreTypeName = (coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV";
SK_LOGI("========== coreId=%u, coreType=%s, startPC=0x%lx, currentPC=0x%lx ==========",
coreId, coreTypeName, startPC, currentPC);
if (!kernelFuncName.name.empty()) {
SK_LOGE(" Function name: %s", kernelFuncName.name.c_str());
} else {
SK_LOGE(" No function name available");
}
}
KernelFuncName SuperKernelExceptionHandler::GetOrLoadKernelSymbols(uint32_t opId) {
auto it = opSymbolCache.find(opId);
if (it != opSymbolCache.end()) {
SK_LOGI("Using cached function name for opId=%u", opId);
return it->second;
}
if (opId >= skHeaderInfoHost->nodeCnt) {
SK_LOGE("opId=%u exceeds nodeCnt=%u", opId, skHeaderInfoHost->nodeCnt);
return KernelFuncName{""};
}
if (funcNodeIndices_.count(opId) == 0) {
SK_LOGI("opId=%u is not a FUNC type node", opId);
return KernelFuncName{""};
}
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
SkDfxInfo *dfxInfo = reinterpret_cast<SkDfxInfo*>(dataBase + skHeaderInfoHost->dfxOffset);
aclrtFuncHandle funcHdl = reinterpret_cast<aclrtFuncHandle>(dfxInfo[opId].funcHdlOri);
SK_LOGI("Loading function name for opId=%u, funcHdl=%p", opId, funcHdl);
char funcName[256] = {0};
constexpr uint32_t maxLen = sizeof(funcName);
aclError ret = aclrtGetFunctionName(funcHdl, maxLen, funcName);
if (ret != ACL_SUCCESS) {
SK_LOGE("aclrtGetFunctionName failed for opId=%u, funcHdl=%p, ret=%d", opId, funcHdl, ret);
return KernelFuncName{""};
}
SK_LOGI("Got function name for opId=%u: %s", opId, funcName);
KernelFuncName kernelFuncName{std::string(funcName)};
opSymbolCache[opId] = kernelFuncName;
return kernelFuncName;
}
void SuperKernelExceptionHandler::PrintMatchedNodeBasicInfo(
uint32_t coreId, rtCoreType_t coreType, uint64_t startPC, uint64_t currentPC,
uint32_t nodeIdx, int entryIdx, uint64_t entryAddr, uint64_t endAddr,
uint32_t funcSize, const SkDfxInfo &dfxNode) const {
const char *coreTypeName = (coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV";
uint16_t modelIdIdx = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 32) & 0xFFFF);
uint16_t skScopeId = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 16) & 0xFFFF);
std::string modelId = SkEventRecorder::Instance().GetModelIdByIndex(modelIdIdx);
SK_LOGE("============================================================");
SK_LOGE("[Core %u] ModelIdIdx=%u, ModelId=%s, SkScopeId=%u", coreId, modelIdIdx, modelId.c_str(), skScopeId);
SK_LOGE("[Core %u] CoreType: %s", coreId, coreTypeName);
SK_LOGE("[Core %u] StartPC: 0x%lx", coreId, startPC);
SK_LOGE("[Core %u] CurrentPC: 0x%lx", coreId, currentPC);
SK_LOGE("[Core %u] Found in node[%u], entry[%d]", coreId, nodeIdx, entryIdx);
SK_LOGE("[Core %u] Entry address: 0x%lx", coreId, entryAddr);
SK_LOGE("[Core %u] End address: 0x%lx", coreId, endAddr);
SK_LOGE("[Core %u] Function size: 0x%x (%u bytes)", coreId, funcSize, funcSize);
SK_LOGE("[Core %u] numBlocks=%u, cubeNum=%u, vecNum=%u", coreId,
dfxNode.numBlocks, dfxNode.cubeNum, dfxNode.vecNum);
}
void SuperKernelExceptionHandler::PrintFuncSymbolInfo(
uint32_t coreId, rtCoreType_t coreType, uint32_t nodeIdx, int entryIdx,
const uint64_t* entries, const SkDfxInfo &dfxNode) {
if (funcNodeIndices_.count(nodeIdx) == 0) {
SK_LOGE("[Core %u] Node[%u] is not a FUNC type", coreId, nodeIdx);
return;
}
aclrtFuncHandle funcHdl = reinterpret_cast<aclrtFuncHandle>(dfxNode.funcHdlOri);
char funcName[256] = {0};
aclError ret = aclrtGetFunctionName(funcHdl, sizeof(funcName), funcName);
if (ret == ACL_SUCCESS) {
SK_LOGE("[Core %u] Origin function name: %s", coreId, funcName);
} else {
SK_LOGE("Failed to get origin function name for node[%u], ret=%d", nodeIdx, ret);
}
aclrtBinHandle binHdl = reinterpret_cast<aclrtBinHandle>(dfxNode.binHdl);
void *binHostAddr = nullptr;
uint32_t binHostSize = 0;
int rtRet = rtGetBinBuffer(binHdl, RT_BIN_HOST_ADDR, &binHostAddr, &binHostSize);
if (rtRet != 0 || binHostAddr == nullptr) {
SK_LOGE("[Core %u] Failed to get bin buffer for node[%u], rtRet=%d", coreId, nodeIdx, rtRet);
return;
}
uint64_t funcOffset = (coreType == RT_CORE_TYPE_AIC)
? dfxNode.aicFuncOffset[entryIdx] : dfxNode.aivFuncOffset[entryIdx];
if (funcOffset == 0 && dfxNode.aicFuncOffset[0] != 0 && dfxNode.entryAic[0] != 0) {
uint64_t binDevAddr = dfxNode.entryAic[0] - dfxNode.aicFuncOffset[0];
funcOffset = entries[entryIdx] - binDevAddr;
SK_LOGI("[Core %u] Computed funcOffset=0x%lx for node[%u] entry[%d] (binDevAddr=0x%lx)",
coreId, funcOffset, nodeIdx, entryIdx, binDevAddr);
}
if (funcOffset == 0) {
SK_LOGE("[Core %u] funcOffset is 0 for node[%u] entry[%d], cannot resolve SK symbol name",
coreId, nodeIdx, entryIdx);
return;
}
std::string symbolName;
uint64_t symbolSize = 0;
std::string symbolBind;
if (GetFuncSymbolInfo(binHdl, static_cast<const char*>(binHostAddr), binHostSize,
funcOffset, symbolName, symbolSize, symbolBind)) {
SK_LOGE("[Core %u] Bound SK symbol name: %s (bind=%s, size=0x%lx)",
coreId, symbolName.c_str(), symbolBind.c_str(), symbolSize);
} else {
SK_LOGE("[Core %u] Failed to get SK symbol name for node[%u] entry[%d], offset=0x%lx",
coreId, nodeIdx, entryIdx, funcOffset);
}
}
void SuperKernelExceptionHandler::PrintNodeDevArgs(
uint32_t coreId, rtCoreType_t coreType, uint32_t nodeIdx) const {
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
TaskQue *taskQue = (coreType == RT_CORE_TYPE_AIC)
? reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aicQueOffset)
: reinterpret_cast<TaskQue*>(dataBase + skHeaderInfoHost->aivQueOffset);
uint32_t taskCnt = (coreType == RT_CORE_TYPE_AIC) ? aicTaskCnt : aivTaskCnt;
for (uint32_t t = 0; t < taskCnt; ++t) {
if (taskQue->taskInfos[t].index == nodeIdx && taskQue->taskInfos[t].type == SkTaskType::TYPE_FUNC) {
SK_LOGE("[Core %u] node[%u] devArgs: 0x%lx", coreId, nodeIdx, taskQue->taskInfos[t].args);
break;
}
}
}
void SuperKernelExceptionHandler::PrintNoMatchInfo(
uint32_t coreId, rtCoreType_t coreType, uint64_t startPC, uint64_t currentPC) const {
const char *coreTypeName = (coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV";
uint16_t modelIdIdx = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 32) & 0xFFFF);
uint16_t skScopeId = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 16) & 0xFFFF);
std::string modelId = SkEventRecorder::Instance().GetModelIdByIndex(modelIdIdx);
SK_LOGE("============================================================");
SK_LOGE("[Core %u] No sub kernel matched, aicore error occurred in sk entry.", coreId);
SK_LOGE("[Core %u] ModelIdIdx=%u, ModelId=%s, SkScopeId=%u", coreId, modelIdIdx, modelId.c_str(), skScopeId);
SK_LOGE("[Core %u] CoreType: %s", coreId, coreTypeName);
SK_LOGE("[Core %u] startPC: 0x%lx", coreId, startPC);
SK_LOGE("[Core %u] CurrentPC: 0x%lx", coreId, currentPC);
SK_LOGE("============================================================");
}
void SuperKernelExceptionHandler::IdentifyErrorNodeByPC(uint32_t coreId, rtCoreType_t coreType,
uint64_t startPC, uint64_t currentPC) {
errorNodeIdx_ = -1;
if (skHeaderInfoHost->dfxOffset == 0 || skHeaderInfoHost->nodeCnt == 0 || currentPC == 0) {
return;
}
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
SkDfxInfo *dfxInfo = reinterpret_cast<SkDfxInfo*>(dataBase + skHeaderInfoHost->dfxOffset);
for (uint32_t i = 0; i < skHeaderInfoHost->nodeCnt; ++i) {
uint64_t* entries = (coreType == RT_CORE_TYPE_AIC) ? dfxInfo[i].entryAic : dfxInfo[i].entryAiv;
uint32_t funcSize = (coreType == RT_CORE_TYPE_AIC) ? dfxInfo[i].aicSize : dfxInfo[i].aivSize;
for (int j = 0; j < 4; ++j) {
if (entries[j] == 0) {
continue;
}
uint64_t entryAddr = entries[j];
uint64_t endAddr = entryAddr + funcSize;
if (currentPC >= entryAddr && currentPC < endAddr) {
PrintMatchedNodeBasicInfo(coreId, coreType, startPC, currentPC, i, j, entryAddr, endAddr, funcSize, dfxInfo[i]);
PrintFuncSymbolInfo(coreId, coreType, i, j, entries, dfxInfo[i]);
PrintNodeDevArgs(coreId, coreType, i);
SK_LOGE("============================================================");
errorNodeIdx_ = i;
SK_LOGE("errorNodeIdx_ = %d", errorNodeIdx_);
return;
}
}
}
PrintNoMatchInfo(coreId, coreType, startPC, currentPC);
SK_LOGE("errorNodeIdx_ = -1");
}
void SuperKernelExceptionHandler::PrintCoreSymbols(uint32_t coreId, rtCoreType_t coreType,
uint64_t startPC, uint64_t currentPC) {
if (coreId >= aicoreNums) {
SK_LOGE("coreId=%u exceeds aicoreNums=%u", coreId, aicoreNums);
return;
}
if (!hasOpTrace_) {
return;
}
uint8_t *dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
SkCounterInfo *counterInfo = reinterpret_cast<SkCounterInfo*>(dataBase + skHeaderInfoHost->counterOffset);
uint32_t opState = counterInfo[coreId].opState;
uint32_t opId = counterInfo[coreId].index;
const char *coreTypeName = (coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV";
SK_LOGE("[Core %u] Type=%s, opState=%u, opId=%u",
coreId, coreTypeName, opState, opId);
if (opState == static_cast<uint8_t>(SkOpTraceType::ORIGIN)) {
SK_LOGE("[Core %u] No SK entry executed yet.", coreId);
} else if (opState == static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_LAUNCHED)) {
SK_LOGE("[Core %u] SK started but no sub-kernel executed yet, checking next operators", coreId);
if (opId < skHeaderInfoHost->nodeCnt) {
SK_LOGE("[Core %u] > Next opId=%u (first sub-kernel):", coreId, opId);
KernelFuncName nextKernelFuncName = GetOrLoadKernelSymbols(opId);
PrintSymbolByCoreId(coreId, coreType, startPC, currentPC, nextKernelFuncName);
} else {
SK_LOGE("[Core %u] > No next operator (opId=%u/%u)", coreId, opId, skHeaderInfoHost->nodeCnt);
}
} else if (opState == static_cast<uint8_t>(SkOpTraceType::OP_LAUNCHED)) {
SK_LOGE("[Core %u] Currently running opId=%u", coreId, opId);
KernelFuncName kernelFuncName = GetOrLoadKernelSymbols(opId);
PrintSymbolByCoreId(coreId, coreType, startPC, currentPC, kernelFuncName);
} else if (opState == static_cast<uint8_t>(SkOpTraceType::OP_FINISHED)) {
SK_LOGE("[Core %u] opId=%u finished, checking current and next operators", coreId, opId);
SK_LOGE("[Core %u] > Last finished opId=%u:", coreId, opId);
KernelFuncName currentKernelFuncName = GetOrLoadKernelSymbols(opId);
PrintSymbolByCoreId(coreId, coreType, startPC, currentPC, currentKernelFuncName);
if (opId + 1 < skHeaderInfoHost->nodeCnt) {
SK_LOGE("[Core %u] > Next opId=%u:", coreId, opId + 1);
KernelFuncName nextKernelFuncName = GetOrLoadKernelSymbols(opId + 1);
PrintSymbolByCoreId(coreId, coreType, startPC, currentPC, nextKernelFuncName);
} else {
SK_LOGE("[Core %u] > No next operator (opId=%u/%u)", coreId, opId, skHeaderInfoHost->nodeCnt);
}
} else if (opState == static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_FINISHED)) {
SK_LOGE("[Core %u] SK entry operator execution completed.", coreId);
} else {
SK_LOGE("[Core %u] Unknown opState: %u", coreId, opState);
}
}
void SuperKernelExceptionHandler::PrintAllCoreSymbols() {
if (!hasOpTrace_) {
return;
}
SK_LOGE("==================================================");
SK_LOGE("=== Sub-kernel running info on all %u cores ===", aicoreNums);
SK_LOGE("==================================================");
for (uint32_t coreId = 0; coreId < aicoreNums; coreId++) {
rtCoreType_t coreType = CoreIsAiv(coreId) ? RT_CORE_TYPE_AIV : RT_CORE_TYPE_AIC;
uint64_t startPC = 0;
uint64_t currentPC = 0;
PrintCoreSymbols(coreId, coreType, startPC, currentPC);
}
}
bool SuperKernelExceptionHandler::ParseAndPrintSubKernelSymbols(aclrtExceptionInfo *exceptionInfo) {
ExceptionRegInfo exceptionRegInfo{0, nullptr};
if (!GetExceptionRegInfo(*reinterpret_cast<const aclrtExceptionInfo*>(exceptionInfo), exceptionRegInfo)) {
return false;
}
SK_LOGE("====================================================");
SK_LOGE("=== Sub-kernel running info on exception cores ===");
SK_LOGE("====================================================");
for (uint32_t i = 0; i < exceptionRegInfo.coreNum; i++) {
rtExceptionErrRegInfo_t coreErrRegInfo = exceptionRegInfo.errRegInfo[i];
rtCoreType_t coreType = static_cast<rtCoreType_t>(coreErrRegInfo.coreType);
IdentifyErrorNodeByPC(coreErrRegInfo.coreId, coreType, coreErrRegInfo.startPC, coreErrRegInfo.currentPC);
PrintCoreSymbols(coreErrRegInfo.coreId, coreType,
coreErrRegInfo.startPC,
coreErrRegInfo.currentPC);
uint64_t condValue = GetCondRegValue(coreErrRegInfo);
ParseAndPrintCondInfo(coreErrRegInfo.coreId, coreType, condValue);
}
return true;
}
void SuperKernelExceptionHandler::FreeResources() {
if (skDeviceEntryArgsHost != nullptr) {
aclrtFreeHost(skDeviceEntryArgsHost);
skDeviceEntryArgsHost = nullptr;
}
skHeaderInfoHost = nullptr;
errorNodeIdx_ = -1;
opSymbolCache.clear();
funcNodeIndices_.clear();
}
bool SuperKernelExceptionHandler::StartsWith(const char* source, const char* prefix) {
if (!source || !prefix) {
return false;
}
size_t srcLen = strlen(source);
size_t prefixLen = strlen(prefix);
if (prefixLen > srcLen) {
return false;
}
return strncmp(source, prefix, prefixLen) == 0;
}
aclError SuperKernelExceptionHandler::CheckError(aclError ret, const char *errorMsg) {
if (ret != ACL_SUCCESS) {
SK_LOGE("Operation failed: %s returned error code %d", errorMsg, static_cast<int32_t>(ret));
}
return ret;
}
bool SuperKernelExceptionHandler::IsSuperKernelException(aclrtExceptionInfo *exceptionInfo) {
constexpr uint32_t MAX_FUNC_NAME_LEN = 256;
char funcName[MAX_FUNC_NAME_LEN] = {0};
aclrtFuncHandle funcHandle = nullptr;
auto ret = aclrtGetFuncHandleFromExceptionInfo(exceptionInfo, &funcHandle);
if (ret != ACL_SUCCESS) {
SK_LOGE("Failed to get func handle from exception info, ret=%d", ret);
return false;
}
ret = aclrtGetFunctionName(funcHandle, MAX_FUNC_NAME_LEN, funcName);
if (ret != ACL_SUCCESS) {
SK_LOGE("Failed to get function name, ret=%d", ret);
return false;
}
if (!StartsWith(funcName, "sk_entry")) {
SK_LOGD("fault kernel_name '%s' does not start with 'sk_entry', skipping", funcName);
return false;
}
hasOpTrace_ = (strstr(funcName, "op_trace") != nullptr);
SK_LOGE("Exception is from superkernel function '%s', op_trace=%s, proceeding with handling",
funcName, hasOpTrace_ ? "true" : "false");
return true;
}
* \brief Prepare common operations before exception dump
* \param exceptionInfo Input: exception info
* \param exceptionRegInfo Output: exception register info
* \return aclError ACL_SUCCESS on success, error code on failure
*/
aclError SuperKernelExceptionHandler::PrepareExceptionDump(aclrtExceptionInfo* exceptionInfo, ExceptionRegInfo& exceptionRegInfo)
{
if (!ExtractSkEntryArgs(exceptionInfo) || !ExtractTaskQueue()) {
FreeResources();
SK_LOGE("Failed to extract SK entry args or task queue");
return ACL_ERROR_FAILURE;
}
exceptionRegInfo = {0, nullptr};
if (!GetExceptionRegInfo(*exceptionInfo, exceptionRegInfo)) {
FreeResources();
SK_LOGE("Failed to get exception register info");
return ACL_ERROR_FAILURE;
}
return ACL_SUCCESS;
}
* \brief Fill sk_entry function name into kernelDisplayName
* \param dumpInfo Output: dump info structure
* \param exceptionInfo Input: exception info
* \return aclError ACL_SUCCESS on success, error code on failure
*/
aclError SuperKernelExceptionHandler::PopulateSkEntryFields(Adx::ExceptionDumpInfo& dumpInfo, aclrtExceptionInfo* exceptionInfo)
{
char skFuncName[Adx::MAX_KERNELNAME_LEN] = {0};
aclrtFuncHandle funcHandle = nullptr;
aclError ret = aclrtGetFuncHandleFromExceptionInfo(exceptionInfo, &funcHandle);
if (ret != ACL_SUCCESS) {
SK_LOGE("Failed to get func handle from exception info, ret=%d", ret);
return ACL_ERROR_FAILURE;
}
ret = aclrtGetFunctionName(funcHandle, Adx::MAX_KERNELNAME_LEN, skFuncName);
if (ret != ACL_SUCCESS) {
SK_LOGE("Failed to get function name, ret=%d", ret);
return ACL_ERROR_FAILURE;
}
uint16_t modelIdIdx = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 32) & 0xFFFF);
uint16_t skScopeId = static_cast<uint16_t>((skHeaderInfoHost->modelIdIndexAndSkScopeId >> 16) & 0xFFFF);
std::string modelId = SkEventRecorder::Instance().GetModelIdByIndex(modelIdIdx);
std::string skNameFromRecorder = SkEventRecorder::Instance().GetSkName(modelId, skScopeId);
if (!skNameFromRecorder.empty()) {
snprintf_s(dumpInfo.kernelDisplayName, Adx::MAX_KERNELNAME_LEN, Adx::MAX_KERNELNAME_LEN - 1,
"%s", skNameFromRecorder.c_str());
} else {
snprintf_s(dumpInfo.kernelDisplayName, Adx::MAX_KERNELNAME_LEN, Adx::MAX_KERNELNAME_LEN - 1,
"%s_scope%u", skFuncName, skScopeId);
}
return ACL_SUCCESS;
}
* \brief Fill tensor related fields (coreId, coreType, extraTensor)
* \param dumpInfo Output: dump info structure
* \param coreId Input: exception core ID
* \param coreType Input: exception core type
*/
void SuperKernelExceptionHandler::PopulateTensorFields(Adx::ExceptionDumpInfo& dumpInfo, uint32_t coreId, rtCoreType_t coreType)
{
dumpInfo.coreId = coreId;
dumpInfo.coreType = coreType;
dumpInfo.extraTensorNum = 1;
Adx::TensorInfo& tensor = dumpInfo.extraTensor[0];
tensor.type = Adx::TensorType::WORKSPACE;
tensor.tensorSize = skHeaderInfoHost->totalSize;
tensor.format = ACL_FORMAT_ND;
tensor.dataType = ACL_UINT8;
tensor.tensorAddr = reinterpret_cast<int64_t*>(skDeviceEntryArgsDev);
tensor.addrType = Adx::AddressType::RAW;
tensor.placement = Adx::TensorPlacement::kOnDeviceHbm;
tensor.argsOffSet = 0;
tensor.shape = {static_cast<int64_t>(tensor.tensorSize)};
tensor.originShape = {static_cast<int64_t>(tensor.tensorSize)};
}
* \brief Fill sub-kernel fields (bin, kernelName, argAddr, argSize)
* \param dumpInfo Output: dump info structure
* \param errorNodeIdx Input: index of the error sub-kernel node
* \param exceptionInfo exception info for getting SK entry func handle
* \param coreId exception core id
* \return true: filled sub-kernel info; false: no match, fields filled with SK entry info
*/
bool SuperKernelExceptionHandler::PopulateSubKernelFields(Adx::ExceptionDumpInfo& dumpInfo, int32_t errorNodeIdx,
aclrtExceptionInfo* exceptionInfo)
{
if (errorNodeIdx < 0 || static_cast<uint32_t>(errorNodeIdx) >= skHeaderInfoHost->nodeCnt) {
SK_LOGI("No sub kernel matched, fill with SK entry fields");
constexpr uint32_t MAX_FUNC_NAME_LEN = 256;
char funcName[MAX_FUNC_NAME_LEN] = {0};
aclrtFuncHandle funcHandle = nullptr;
aclError ret = aclrtGetFuncHandleFromExceptionInfo(exceptionInfo, &funcHandle);
if (ret != ACL_SUCCESS || funcHandle == nullptr) {
SK_LOGE("Failed to get SK entry func handle, ret=%d", ret);
memset_s(dumpInfo.kernelName, sizeof(dumpInfo.kernelName), 0, sizeof(dumpInfo.kernelName));
dumpInfo.bin = nullptr;
dumpInfo.argAddr = nullptr;
dumpInfo.argSize = 0;
return false;
}
ret = aclrtGetFunctionName(funcHandle, MAX_FUNC_NAME_LEN, funcName);
if (ret == ACL_SUCCESS) {
SK_LOGI("Successfully got function name: %s", funcName);
snprintf_s(dumpInfo.kernelName, Adx::MAX_KERNELNAME_LEN, Adx::MAX_KERNELNAME_LEN - 1, "%s", funcName);
} else {
SK_LOGI("Failed to get function name, ret=%d", ret);
memset_s(dumpInfo.kernelName, sizeof(dumpInfo.kernelName), 0, sizeof(dumpInfo.kernelName));
}
ret = aclrtFunctionGetBinary(funcHandle, &dumpInfo.bin);
if (ret != ACL_SUCCESS) {
SK_LOGE("Failed to get SK entry bin handle, ret=%d", ret);
dumpInfo.bin = nullptr;
}
dumpInfo.argAddr = nullptr;
dumpInfo.argSize = 0;
SK_LOGI("Not match sub Kernel, exception occurred within Superkernel");
return true;
}
SK_LOGI("Fill sub kernel info, errorNodeIdx=%d", errorNodeIdx);
uint8_t* dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
SkDfxInfo* dfxInfo = reinterpret_cast<SkDfxInfo*>(dataBase + skHeaderInfoHost->dfxOffset);
SkDfxInfo& errorNode = dfxInfo[errorNodeIdx];
dumpInfo.bin = reinterpret_cast<void*>(errorNode.binHdl);
KernelFuncName kernelFunc = GetOrLoadKernelSymbols(errorNodeIdx);
snprintf_s(dumpInfo.kernelName, Adx::MAX_KERNELNAME_LEN, Adx::MAX_KERNELNAME_LEN - 1, "%s", kernelFunc.name.c_str());
uint64_t argsAddr = 0;
uint32_t argsSize = 0;
GetSubKernelTaskArgs(errorNodeIdx, argsAddr, argsSize);
dumpInfo.argAddr = reinterpret_cast<void*>(argsAddr);
dumpInfo.argSize = argsSize;
SK_LOGI("Match sub kernel success, exception occurred within sub kernel");
return true;
}
* \brief Populate all fields of exception dump info structure
* \param dumpInfo Output: dump info structure
* \param errorNodeIdx Index of the error sub-kernel node
* \param exceptionInfo exception info
* \param coreId exception core id
* \param coreType exception core type (AIC/AIV)
* \return aclError ACL_SUCCESS on success, error code on failure
*/
aclError SuperKernelExceptionHandler::PopulateDumpInfoFields(Adx::ExceptionDumpInfo& dumpInfo, int32_t errorNodeIdx,
aclrtExceptionInfo* exceptionInfo, uint32_t coreId, rtCoreType_t coreType)
{
if (skHeaderInfoHost == nullptr || skDeviceEntryArgsDev == nullptr) {
SK_LOGE("PopulateDumpInfoFields: null pointer");
return ACL_ERROR_FAILURE;
}
aclError ret = PopulateSkEntryFields(dumpInfo, exceptionInfo);
if (ret != ACL_SUCCESS) {
SK_LOGI("PopulateSkEntryFields failed, ret=%d", ret);
return ret;
}
PopulateTensorFields(dumpInfo, coreId, coreType);
if(PopulateSubKernelFields(dumpInfo, errorNodeIdx, exceptionInfo)) {
SK_LOGI("PopulateSubKernelFields success");
}
return ACL_SUCCESS;
}
* \brief Fill exception dump information structure
* \param dumpInfo Output: exception dump info structure to be filled
* \return aclError ACL_SUCCESS on success, error code on failure
*/
aclError SuperKernelExceptionHandler::FillExceptionDumpInfo(Adx::ExceptionDumpInfo& dumpInfo, aclrtExceptionInfo* exceptionInfo)
{
if (exceptionInfo == nullptr) {
SK_LOGE("FillExceptionDumpInfo: no exception info");
return ACL_ERROR_INVALID_PARAM;
}
ExceptionRegInfo exceptionRegInfo{0, nullptr};
aclError ret = PrepareExceptionDump(exceptionInfo, exceptionRegInfo);
if (ret != ACL_SUCCESS) {
return ret;
}
int32_t errorNodeIdx = -1;
for (uint32_t i = 0; i < exceptionRegInfo.coreNum; i++) {
rtExceptionErrRegInfo_t coreErr = exceptionRegInfo.errRegInfo[i];
IdentifyErrorNodeByPC(coreErr.coreId, (rtCoreType_t)coreErr.coreType,
coreErr.startPC, coreErr.currentPC);
errorNodeIdx = errorNodeIdx_;
(void)PopulateDumpInfoFields(dumpInfo, errorNodeIdx, exceptionInfo, coreErr.coreId, (rtCoreType_t)coreErr.coreType);
}
SK_LOGI("FillExceptionDumpInfo final errorNodeIdx=%d", errorNodeIdx);
FreeResources();
return ACL_SUCCESS;
}
bool SuperKernelExceptionHandler::GetSubKernelTaskArgs(uint32_t nodeIdx, uint64_t& argsAddr, uint32_t& argsSize)
{
if (nodeIdx >= skHeaderInfoHost->nodeCnt) {
return false;
}
uint8_t* dataBase = reinterpret_cast<uint8_t*>(skDeviceEntryArgsHost);
auto GetTaskArgs = [&](uint32_t queOffset, uint32_t taskCnt) -> bool {
if (queOffset == 0 || taskCnt == 0) {
SK_LOGI("queOffset or taskCnt is null, skip");
return false;
}
TaskQue* taskQue = reinterpret_cast<TaskQue*>(dataBase + queOffset);
for (uint32_t i = 0; i < taskCnt; ++i) {
const TaskInfo& task = taskQue->taskInfos[i];
if (task.index == nodeIdx && task.type == SkTaskType::TYPE_FUNC) {
argsAddr = task.args;
argsSize = task.argsSize;
return true;
}
}
SK_LOGI("task not found, skip");
return false;
};
if (GetTaskArgs(skHeaderInfoHost->aivQueOffset, aivTaskCnt)) {
return true;
}
if (GetTaskArgs(skHeaderInfoHost->aicQueOffset, aicTaskCnt)) {
return true;
}
return false;
}
namespace {
static const char* TensorTypeToString(Adx::TensorType type)
{
switch (type) {
case Adx::TensorType::INPUT: return "INPUT";
case Adx::TensorType::OUTPUT: return "OUTPUT";
case Adx::TensorType::WORKSPACE: return "WORKSPACE";
default: return "UNKNOWN";
}
}
static const char* AddressTypeToString(Adx::AddressType type)
{
switch (type) {
case Adx::AddressType::TRADITIONAL: return "TRADITIONAL";
case Adx::AddressType::NOTILING: return "NOTILING";
case Adx::AddressType::RAW: return "RAW";
default: return "UNKNOWN";
}
}
static void PrintVector(const std::vector<int64_t>& vec, const char* name) {
if (vec.empty()) {
SK_LOGI(" %-18s: empty", name);
return;
}
std::string vecStr;
for (size_t i = 0; i < vec.size(); ++i) {
vecStr += std::to_string(vec[i]);
if (i != vec.size() - 1) vecStr += ", ";
}
SK_LOGI(" %-18s: [%s]", name, vecStr.c_str());
}
}
static void PrintExceptionDumpInfoArray(Adx::ExceptionDumpInfo* dumpInfos, uint32_t dumpCount)
{
if (dumpInfos == nullptr || dumpCount == 0) {
SK_LOGI("Adx::ExceptionDumpInfo array is empty, skip print");
return;
}
SK_LOGI("==============================================================");
SK_LOGI("======= SuperKernel Exception Dump Info Array (count=%u) ======", dumpCount);
SK_LOGI("==============================================================");
for (uint32_t i = 0; i < dumpCount; i++) {
Adx::ExceptionDumpInfo& info = dumpInfos[i];
SK_LOGI("------------------ Adx::ExceptionDumpInfo [%u] ------------------", i);
SK_LOGI("coreId : %u", info.coreId);
SK_LOGI("coreType : %s", (info.coreType == RT_CORE_TYPE_AIC) ? "AIC" : "AIV");
SK_LOGI("argSize : %u", info.argSize);
SK_LOGI("argAddr : 0x%lx", reinterpret_cast<uint64_t>(info.argAddr));
SK_LOGI("bin handle : 0x%lx", reinterpret_cast<uint64_t>(reinterpret_cast<void*>(info.bin)));
SK_LOGI("kernelName : %s", info.kernelName);
SK_LOGI("kernelDisplayName : %s", info.kernelDisplayName);
SK_LOGI("extraTensorNum : %u", info.extraTensorNum);
for (uint32_t j = 0; j < info.extraTensorNum && j < Adx::EXCEPTION_DUMP_MAX_TENSOR_NUM; j++) {
Adx::TensorInfo& tensor = info.extraTensor[j];
SK_LOGI(" >>> extraTensor[%u] <<<", j);
SK_LOGI(" type : %s", TensorTypeToString(tensor.type));
SK_LOGI(" tensorSize : %zu bytes", tensor.tensorSize);
SK_LOGI(" format : %d", tensor.format);
SK_LOGI(" dataType : %d", tensor.dataType);
SK_LOGI(" tensorAddr : 0x%lx", reinterpret_cast<uint64_t>(tensor.tensorAddr));
SK_LOGI(" addrType : %s", AddressTypeToString(tensor.addrType));
SK_LOGI(" placement : %d", tensor.placement);
SK_LOGI(" argsOffSet : %u", tensor.argsOffSet);
PrintVector(tensor.shape, "shape");
PrintVector(tensor.originShape, "originShape");
}
}
SK_LOGI("==============================================================\n");
}
void SuperKernelExceptionCallBackFunc(aclrtExceptionInfo *exceptionInfo)
{
SuperKernelExceptionHandler handler;
handler.HandleException(exceptionInfo);
}
uint32_t SuperKernelExceptionHandler::ProcessExceptionDump(
aclrtExceptionInfo* exceptionInfo,
Adx::ExceptionDumpInfo* exceptionDumpInfo,
uint32_t exceptionDumpSize,
uint32_t* exceptionDumpRealSize,
Adx::ExceptionDumpMode* mode
)
{
if (exceptionDumpSize < 1) {
SK_LOGE("ExceptionDumpCallBack: exceptionDumpSize too small");
return ACL_ERROR_INVALID_PARAM;
}
*exceptionDumpRealSize = 0;
memset_s(exceptionDumpInfo, exceptionDumpSize * sizeof(Adx::ExceptionDumpInfo), 0, exceptionDumpSize * sizeof(Adx::ExceptionDumpInfo));
if (!IsSuperKernelException(exceptionInfo)) {
SK_LOGD("Not superkernel exception, skip dump");
*exceptionDumpRealSize = 0;
*mode = Adx::ExceptionDumpMode::DUMP_MODE_NONE;
return ACL_SUCCESS;
}
SK_LOGI("Exception is in SuperKernel");
ExceptionRegInfo exceptionRegInfo{0, nullptr};
aclError ret = PrepareExceptionDump(exceptionInfo, exceptionRegInfo);
if (ret != ACL_SUCCESS) {
return ret;
}
uint32_t validDumpNum = 0;
for (uint32_t i = 0; i < exceptionRegInfo.coreNum && validDumpNum < exceptionDumpSize; ++i) {
rtExceptionErrRegInfo_t& coreErr = exceptionRegInfo.errRegInfo[i];
IdentifyErrorNodeByPC(coreErr.coreId, (rtCoreType_t)coreErr.coreType, coreErr.startPC, coreErr.currentPC);
aclError ret = PopulateDumpInfoFields(exceptionDumpInfo[validDumpNum], errorNodeIdx_, exceptionInfo, coreErr.coreId, (rtCoreType_t)coreErr.coreType);
if (ret == ACL_SUCCESS) {
validDumpNum++;
}
}
PrintExceptionDumpInfoArray(exceptionDumpInfo, validDumpNum);
*exceptionDumpRealSize = validDumpNum;
*mode = Adx::ExceptionDumpMode::DUMP_MODE_OVERWRITE;
FreeResources();
SK_LOGI("Adx::ExceptionDumpInfo fill success, coreNum=%u, validDump=%u", exceptionRegInfo.coreNum, validDumpNum);
return ACL_SUCCESS;
}
bool IsValidCommonException(const rtExceptionExpandType_t exceptionTaskType)
{
return exceptionTaskType != RT_EXCEPTION_AICORE && exceptionTaskType != RT_EXCEPTION_FFTS_PLUS && exceptionTaskType != RT_EXCEPTION_FUSION;
}
uint32_t ExceptionDumpInfoCallBack(void* exceptionInfo, Adx::ExceptionDumpInfo* exceptionDumpInfo,
uint32_t exceptionDumpSize, uint32_t* exceptionDumpRealSize, Adx::ExceptionDumpMode* mode)
{
if (exceptionInfo == nullptr || exceptionDumpInfo == nullptr || exceptionDumpRealSize == nullptr) {
SK_LOGE("ExceptionDumpCallBack: invalid null params");
return ACL_ERROR_INVALID_PARAM;
}
if (IsValidCommonException(static_cast<aclrtExceptionInfo*>(exceptionInfo)->expandInfo.type)) {
SK_LOGI("Not superkernel exception, skip dump");
*exceptionDumpRealSize = 0;
*mode = Adx::ExceptionDumpMode::DUMP_MODE_NONE;
return ACL_SUCCESS;
}
SK_LOGI("Start SuperKernelExceptionHandler::ProcessExceptionDump");
SuperKernelExceptionHandler handler;
return handler.ProcessExceptionDump(
static_cast<aclrtExceptionInfo*>(exceptionInfo),
exceptionDumpInfo,
exceptionDumpSize,
exceptionDumpRealSize,
mode
);
}