* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
 * \file sk_node.cpp
 * \brief Super-kernel node initialization, bind-map resolution and JSON/log dump helpers.
 */
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <array>
#include <memory>
#include <limits>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <sstream>
#include <fstream>
#include <utility>
#include <cstring>
#include "sk_node.h"
#include "sk_log.h"
#include "sk_scope_launch.h"
#include "sk_scope_info.h"
#include "sk_lock_detector.h"
#include "sk_common.h"
#include "sk_model_context.h"
#include "sk_options_manager.h"
#include "kernel.h"
#include "base.h"
// Declared with C linkage and returning an aclrtBinHandle; it is not called in the part of the file shown here.
extern "C" aclrtBinHandle AscendGetEntryBinHandle();
// Sentinel stored in SkBindInfo::sknlFuncs[0] to mark a bind-map entry whose duplicate definitions disagree
// (written by InitSuperKernelBindMap, tested by HasInvalidSkBindValue).
constexpr uint64_t INVALID_SK_BIND_VALUE = 0xffffffffffffffffULL;
/**
 * @brief Decode the packed kernel capability word into individual boolean flags.
 * @param cap Raw capability bitfield.
 * @return KernelCapBits with one flag filled in per KernelCapBitOffset that is read here.
 */
KernelCapBits ParseKernelCapBits(uint64_t cap)
{
    // True when the bit located at `offset` inside `cap` is set.
    const auto isSet = [cap](KernelCapBitOffset offset) -> bool {
        const uint64_t shifted = cap >> static_cast<uint8_t>(offset);
        return (shifted & 1ULL) != 0;
    };

    KernelCapBits decoded;
    decoded.earlyStartWaitFlag = isSet(KernelCapBitOffset::EARLY_START_WAIT_FLAG);
    decoded.earlyStartSetFlag = isSet(KernelCapBitOffset::EARLY_START_SET_FLAG);
    decoded.disableDcci = isSet(KernelCapBitOffset::DCCI);
    decoded.disableScheMode = isSet(KernelCapBitOffset::DISABLE_SCHEMODE);
    return decoded;
}
// ---- FusionFailReasonInfo --------------------------------------------------------------------
// Each detail reason is kept as its uint8_t value; the accessors convert back to the enum type.

/** @brief Build a fail-reason record carrying a scope-level detail. */
FusionFailReasonInfo::FusionFailReasonInfo(FusionFailReason reason, ScopeFailReason scopeReason)
    : primary(reason),
      scopeDetailValue(static_cast<uint8_t>(scopeReason))
{
}

/** @brief Build a fail-reason record carrying a deadlock detail. */
FusionFailReasonInfo::FusionFailReasonInfo(FusionFailReason reason, DeadlockFailReason deadlockReason)
    : primary(reason),
      deadlockDetailValue(static_cast<uint8_t>(deadlockReason))
{
}

/** @brief Read the stored scope detail back as a ScopeFailReason. */
ScopeFailReason FusionFailReasonInfo::GetScopeDetail() const
{
    return static_cast<ScopeFailReason>(scopeDetailValue);
}

/** @brief Overwrite the stored scope detail. */
void FusionFailReasonInfo::SetScopeDetail(ScopeFailReason scopeReason)
{
    scopeDetailValue = static_cast<uint8_t>(scopeReason);
}

/** @brief Read the stored deadlock detail back as a DeadlockFailReason. */
DeadlockFailReason FusionFailReasonInfo::GetDeadlockDetail() const
{
    return static_cast<DeadlockFailReason>(deadlockDetailValue);
}

/** @brief Overwrite the stored deadlock detail. */
void FusionFailReasonInfo::SetDeadlockDetail(DeadlockFailReason deadlockReason)
{
    deadlockDetailValue = static_cast<uint8_t>(deadlockReason);
}

/** @brief Build a fail-reason record carrying a bind-map detail. */
FusionFailReasonInfo::FusionFailReasonInfo(FusionFailReason reason, BindmapFailReason bindmapReason)
    : primary(reason),
      bindmapDetailValue(static_cast<uint8_t>(bindmapReason))
{
}

/** @brief Read the stored bind-map detail back as a BindmapFailReason. */
BindmapFailReason FusionFailReasonInfo::GetBindmapDetail() const
{
    return static_cast<BindmapFailReason>(bindmapDetailValue);
}

/** @brief Overwrite the stored bind-map detail. */
void FusionFailReasonInfo::SetBindmapDetail(BindmapFailReason bindmapReason)
{
    bindmapDetailValue = static_cast<uint8_t>(bindmapReason);
}
/**
 * @brief Map a BindmapFailReason to a human-readable, statically allocated string.
 * @param reason Bind-map failure reason.
 * @return Description text; "UNKNOWN_BINDMAP_REASON" for values not listed below.
 */
const char* BindmapFailReasonToStr(BindmapFailReason reason)
{
    const char* text = "UNKNOWN_BINDMAP_REASON";
    switch (reason) {
        case BindmapFailReason::NONE:
            text = "NONE";
            break;
        case BindmapFailReason::BINDMAP_INIT_EMPTY:
            text = "bindmap init empty";
            break;
        case BindmapFailReason::BINHDL_NULL:
            text = "binHdl is null";
            break;
        case BindmapFailReason::FUNCHDL_NULL:
            text = "funcHdl is null";
            break;
        case BindmapFailReason::FUNC_NOT_FOUND:
            text = "function not found in bind map";
            break;
        case BindmapFailReason::BIN_DEV_ADDR_GET_FAILED:
            text = "failed to get binary device address";
            break;
        case BindmapFailReason::FUNC_ADDR_GET_FAILED:
            text = "failed to get function address";
            break;
        case BindmapFailReason::BINDMAP_ENTRY_CONFLICT:
            text = "bind map entry conflict";
            break;
        case BindmapFailReason::BINDMAP_CAP_INCONSISTENT:
            text = "bind map cap inconsistent";
            break;
        case BindmapFailReason::BIN_HOST_ADDR_GET_FAILED:
            text = "failed to get binary host address";
            break;
        default:
            break;
    }
    return text;
}
std::string FusionFailReasonToStr(const FusionFailReasonInfo& info) {
std::string result = FusionFailReasonToStr(info.primary);
if (info.primary == FusionFailReason::SCOPE_FUSE_PART) {
ScopeFailReason scopeDetail = info.GetScopeDetail();
if (scopeDetail != ScopeFailReason::NONE) {
result += " [";
result += ScopeFailReasonToStr(scopeDetail);
result += "]";
}
} else if (info.primary == FusionFailReason::EXIST_DEADLOCK) {
DeadlockFailReason deadlockDetail = info.GetDeadlockDetail();
if (deadlockDetail != DeadlockFailReason::NOT_FIND_DEADLOCK) {
result += " [";
result += DeadlockFailReasonToStr(deadlockDetail);
result += "]";
}
} else if (info.primary == FusionFailReason::BINDMAP_IS_EMPTY) {
BindmapFailReason bindmapDetail = info.GetBindmapDetail();
if (bindmapDetail != BindmapFailReason::NONE) {
result += " [";
result += BindmapFailReasonToStr(bindmapDetail);
result += "]";
}
}
return result;
}
/**
 * @brief Build the super-kernel bind map of one binary from its SK_INFO meta entries.
 *
 * Queries how many RT_BINARY_TYPE_SK_INFO meta entries @p binHdl has, fetches them and
 * converts each entry into a SkBindInfo keyed by the entry's globalFunc value. When the
 * same globalFunc shows up again with a different cap or sknlFunc set, the stored entry
 * is marked invalid by writing INVALID_SK_BIND_VALUE into its sknlFuncs[0].
 *
 * @param binHdl Binary handle whose meta information is queried.
 * @return The populated map; an empty map when a meta query fails or reports zero entries.
 */
SkBindMap InitSuperKernelBindMap(aclrtBinHandle binHdl)
{
    struct __attribute__((packed)) SknlValuePayload {
        uint32_t res;
        SknlMapInfo info;
    };
    constexpr size_t payloadSize = sizeof(SknlValuePayload);

    size_t metaNum = 0;
    // The call and the comparison are kept apart on purpose: `int ret = f() != 0` stores the
    // comparison result (0/1), so the log would never show the real return code.
    const auto metaNumRet = rtBinaryGetMetaNum(binHdl, RT_BINARY_TYPE_SK_INFO, &metaNum);
    if (metaNumRet != 0) {
        SK_LOGI("rtBinaryGetMetaNum unsuccessful, ret=%d", static_cast<int>(metaNumRet));
        return SkBindMap();
    }
    if (metaNum == 0) {
        SK_LOGI("metaNum is zero!");
        return SkBindMap();
    }
    SK_LOGI("binHdl=0x%lx, metaNum=%lu, payloadSize=%zu",
            (uint64_t)binHdl, metaNum, payloadSize);

    // One contiguous pool holds all payloads; metaDataList points at each slot inside it.
    std::vector<uint8_t> dataPool(metaNum * payloadSize);
    std::vector<size_t> infoSize(metaNum, payloadSize);
    std::vector<void *> metaDataList(metaNum);
    for (size_t i = 0; i < metaNum; ++i) {
        metaDataList[i] = &dataPool[i * payloadSize];
    }

    const auto metaInfoRet = rtBinaryGetMetaInfo(binHdl, RT_BINARY_TYPE_SK_INFO, metaNum,
                                                 metaDataList.data(), infoSize.data());
    if (metaInfoRet != 0) {
        SK_LOGI("rtBinaryGetMetaInfo failed, ret=%d", static_cast<int>(metaInfoRet));
        return SkBindMap();
    }

    SkBindMap bindMap;
    for (size_t i = 0; i < metaNum; ++i) {
        const auto *payload = static_cast<const SknlValuePayload *>(metaDataList[i]);
        // The payload struct is packed, so the info member is copied out byte-wise
        // into a normally laid out local before its fields are read.
        SknlMapInfo localInfo;
        memcpy_s(&localInfo, sizeof(SknlMapInfo), &(payload->info), sizeof(SknlMapInfo));
        SK_LOGI("[%zu] cap=%lu, globalFunc=0x%lx, skFunc[0]=0x%lx, skFunc[1]=0x%lx, "
                "skFunc[2]=0x%lx, skFunc[3]=0x%lx",
                i, localInfo.cap, (uint64_t)localInfo.globalFunc,
                (uint64_t)localInfo.sknlFunc[0],
                (uint64_t)localInfo.sknlFunc[1],
                (uint64_t)localInfo.sknlFunc[2],
                (uint64_t)localInfo.sknlFunc[3]);

        const auto globalFunc = (uint64_t)(localInfo.globalFunc);
        SkBindInfo bindInfo;
        bindInfo.cap = localInfo.cap;
        bindInfo.sknlFuncs = {
            (uint64_t)(localInfo.sknlFunc[0]),
            (uint64_t)(localInfo.sknlFunc[1]),
            (uint64_t)(localInfo.sknlFunc[2]),
            (uint64_t)(localInfo.sknlFunc[3])
        };

        auto it = bindMap.find(globalFunc);
        if (it != bindMap.end() &&
            (it->second.cap != bindInfo.cap || it->second.sknlFuncs != bindInfo.sknlFuncs)) {
            SK_LOGI("InitSuperKernelBindMap: globalFunc=0x%lx is duplicated with different value",
                    globalFunc);
            // Mark the existing entry invalid instead of overwriting it with the new one.
            it->second.sknlFuncs[0] = INVALID_SK_BIND_VALUE;
            continue;
        }
        bindMap[globalFunc] = bindInfo;
    }
    return bindMap;
}
namespace {
// NOTE(review): AIV type codes; they are not referenced in the part of the file shown here,
// so their exact meaning should be confirmed against the code that uses them.
constexpr uint32_t AIV_TYPE_SIMT_VF_ONLY = 3U;
constexpr uint32_t AIV_TYPE_SIMD_SIMT_MIX_VF = 4U;
// Cache type used by GetSkBindMap: binary handle -> bind map built for that binary.
using SkAllBinMap = std::unordered_map<aclrtBinHandle, SkBindMap>;
// Arguments handed to InitSingleCoreFunc for one core type of one split.
struct CoreFuncInitContext {
// Record that receives funcAddr / funcOffset / prefetchCnt / symbolBind for the core.
ResolvedFunctionInfo* info;
// Bind-map entry whose sknlFuncs[splitIdx] supplies the function offset.
SkBindMap::iterator bindIt;
// Index of the split being initialised.
size_t splitIdx;
// Optional out-parameter for the failure reason; checked against nullptr before being written.
BindmapFailReason* failReason;
};
// Core selector; the enumerator value is used as the index into the per-core arrays of
// ResolvedFunctionInfo (AIC -> 0, AIV -> 1).
enum class SkNodeCoreType: uint32_t {
AIC,
AIV,
};
// Raw attribute id that GetScheModeFromFuncAttr casts to aclrtFuncAttribute to query the schemode.
constexpr int32_t ACL_FUNC_ATTR_KERNEL_SCHEMODE_PLACEHOLDER = 3;
/**
 * @brief Tell whether a bind entry carries the INVALID_SK_BIND_VALUE sentinel in sknlFuncs[0].
 * @param bindInfo Bind-map entry to inspect.
 * @return true when the sentinel is present.
 */
bool HasInvalidSkBindValue(const SkBindInfo &bindInfo)
{
    const bool markedInvalid = (bindInfo.sknlFuncs[0] == INVALID_SK_BIND_VALUE);
    return markedInvalid;
}
/**
 * @brief Accumulate the capability word across bind entries and detect disagreement.
 * @param bindInfo Entry whose cap is merged.
 * @param hasCap   In/out: whether @p cap already holds a value; set to true on first use.
 * @param cap      In/out: the capability recorded so far.
 * @return true when this is the first entry or its cap equals the recorded one.
 */
bool UpdateKernelCap(const SkBindInfo &bindInfo, bool &hasCap, uint64_t &cap)
{
    if (hasCap) {
        // A cap was recorded earlier: the new entry must agree with it.
        return bindInfo.cap == cap;
    }
    hasCap = true;
    cap = bindInfo.cap;
    return true;
}
/**
 * @brief Convert a raw schemode value into a ScheModeState.
 * @param rawValue Raw value; 0 maps to SCHE_MODE_OFF and 1 to SCHE_MODE_ON.
 * @return The parsed state, or ScheModeState::NONE (after a warning log) for any other value.
 */
ScheModeState ParseScheModeState(int64_t rawValue)
{
    constexpr int64_t kRawOff = 0;
    constexpr int64_t kRawOn = 1;
    switch (rawValue) {
        case kRawOff:
            return ScheModeState::SCHE_MODE_OFF;
        case kRawOn:
            return ScheModeState::SCHE_MODE_ON;
        default:
            break;
    }
    SK_LOGW("Invalid schemode value: %ld, valid value is 0 or 1", rawValue);
    return ScheModeState::NONE;
}
/**
 * @brief Query the schemode of a function through its function attribute.
 * @param funcHandle Function handle to query.
 * @return Parsed state; ScheModeState::NONE when the attribute query fails or the value is invalid.
 */
ScheModeState GetScheModeFromFuncAttr(aclrtFuncHandle funcHandle)
{
    const auto scheModeAttrId = static_cast<aclrtFuncAttribute>(ACL_FUNC_ATTR_KERNEL_SCHEMODE_PLACEHOLDER);
    int64_t rawScheMode = 0;
    const aclError queryRet = aclrtGetFunctionAttribute(funcHandle, scheModeAttrId, &rawScheMode);
    if (queryRet == ACL_SUCCESS) {
        return ParseScheModeState(rawScheMode);
    }
    SK_LOGE("Failed to query function attribute schemode, ret=%d", queryRet);
    return ScheModeState::NONE;
}
/**
 * @brief Query the schemode of a kernel task through its launch attribute.
 * @param kernelTask Kernel task to query.
 * @return Parsed state; ScheModeState::NONE when the attribute query fails or the value is invalid.
 */
ScheModeState GetScheModeFromKernelTask(aclmdlRITask kernelTask)
{
    SK_LOGI("Query kernel task schemode begin, kernelTask=%p", kernelTask);

    aclrtLaunchKernelAttrValue launchAttr;
    const aclError queryRet = aclmdlRIKernelTaskGetAttribute(
        kernelTask, static_cast<aclrtLaunchKernelAttrId>(ACL_RT_LAUNCH_KERNEL_ATTR_SCHEM_MODE), &launchAttr);
    if (queryRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get task launch attribute schemode, ret=%d", queryRet);
        return ScheModeState::NONE;
    }

    const ScheModeState parsedState = ParseScheModeState(static_cast<int64_t>(launchAttr.schemMode));
    SK_LOGI("Query kernel task schemode end, kernelTask=%p, rawSchemMode=%ld, parsedState=%ld",
            kernelTask, static_cast<int64_t>(launchAttr.schemMode), static_cast<int64_t>(parsedState));
    return parsedState;
}
/**
 * @brief Return the bind map of a binary, building and caching it on first request.
 *
 * The cache is a function-local static keyed by binary handle; no locking is performed here.
 *
 * @param binHdl Binary handle used as the cache key.
 * @return Reference to the cached bind map for @p binHdl.
 */
const SkBindMap& GetSkBindMap(aclrtBinHandle binHdl)
{
    static SkAllBinMap cache;
    const auto cached = cache.find(binHdl);
    if (cached == cache.end()) {
        // First request for this binary: build the map and store it under the handle.
        return cache.emplace(binHdl, InitSuperKernelBindMap(binHdl)).first->second;
    }
    return cached->second;
}
/**
 * @brief Fill the per-core fields of a ResolvedFunctionInfo for one split.
 *
 * Writes funcAddr / funcOffset from the bind-map entry, fetches the host copy of the binary
 * to look up the symbol at that offset and derives prefetchCnt from the symbol size. When the
 * symbol lookup fails, a default prefetchCnt (16 for core index 0, 8 otherwise) is used.
 *
 * @tparam coreType      Core being initialised; its value indexes the per-core arrays.
 * @param ctx            Target record, bind-map iterator, split index and optional fail-reason slot.
 * @param binHdl         Binary handle the function belongs to.
 * @param binDevAddr     Device base address of the binary; added to the function offset.
 * @param validFuncNum   Incremented unless splitIdx > 0 and the split's function equals sknlFuncs[0].
 * @return false only when the host binary buffer cannot be obtained; true otherwise.
 */
template <SkNodeCoreType coreType>
bool InitSingleCoreFunc(const CoreFuncInitContext& ctx, aclrtBinHandle binHdl, void *binDevAddr, uint32_t& validFuncNum)
{
    constexpr uint32_t coreTypeId = static_cast<uint32_t>(coreType);
    const char* const coreName = (coreType == SkNodeCoreType::AIC) ? "AIC" : "AIV";

    const uint64_t skFuncOffset = ctx.bindIt->second.sknlFuncs[ctx.splitIdx];
    ctx.info->funcAddr[coreTypeId] = skFuncOffset + (uint64_t)binDevAddr;
    ctx.info->funcOffset[coreTypeId] = skFuncOffset;

    void *binHostAddr = nullptr;
    uint32_t binHostSize = 0;
    // Store the return code first and compare afterwards: `int ret = f() != 0` would keep
    // only the comparison result, so the error log could never show the real code.
    const auto bufRet = rtGetBinBuffer(binHdl, RT_BIN_HOST_ADDR, &binHostAddr, &binHostSize);
    if (bufRet != 0) {
        SK_LOGE("split[%zu] rtGetBinBuffer failed for %s, ret=%d", ctx.splitIdx,
                coreName, static_cast<int>(bufRet));
        if (ctx.failReason != nullptr) {
            *ctx.failReason = BindmapFailReason::BIN_HOST_ADDR_GET_FAILED;
        }
        return false;
    }

    std::string symbolName = "";
    uint64_t funcSize = 0;
    std::string symbolBind = "";
    if (GetFuncSymbolInfo(binHdl, static_cast<const char*>(binHostAddr), binHostSize, skFuncOffset,
                          symbolName, funcSize, symbolBind)) {
        ctx.info->prefetchCnt[coreTypeId] = AlignUpAndClamp(funcSize, coreTypeId);
        ctx.info->symbolBind[coreTypeId] = symbolBind;
        SK_LOGI("split[%zu] %s symbol=%s, size=0x%lx, bind=%s",
                ctx.splitIdx, coreName, symbolName.c_str(), funcSize, symbolBind.c_str());
    } else {
        ctx.info->prefetchCnt[coreTypeId] = coreTypeId == 0 ? 16 : 8;
        // Arguments are cast so they match the %zu / %u conversions of the format string.
        SK_LOGW("split[%zu] Failed to get %s symbol info, default prefetchCnt[%zu]=%u",
                ctx.splitIdx, coreName, static_cast<size_t>(coreTypeId),
                static_cast<uint32_t>(ctx.info->prefetchCnt[coreTypeId]));
    }

    // A later split that maps to the same function as split 0 is not counted as a valid function.
    if (ctx.splitIdx > 0 && ctx.bindIt->second.sknlFuncs[ctx.splitIdx] == ctx.bindIt->second.sknlFuncs[0]) {
        SK_LOGI("InitSingleCoreFunc: split[%zu] %s function is not sk sub op", ctx.splitIdx, coreName);
    } else {
        validFuncNum++;
    }
    return true;
}
/**
 * @brief Initialise the AIC and/or AIV function of one split and update the resolved counter.
 *
 * Each core whose iterator is not bindMap.end() is initialised through InitSingleCoreFunc.
 *
 * @param info         Record to fill for this split.
 * @param splitIdx     Index of the split.
 * @param bindMap      Map the two iterators belong to.
 * @param aicIt        Bind-map entry for the AIC function, or bindMap.end().
 * @param aivIt        Bind-map entry for the AIV function, or bindMap.end().
 * @param binHdl       Binary handle.
 * @param binDevAddr   Device base address of the binary.
 * @param resolvedNum  Incremented when at least one core reported a valid function.
 * @param failReason   Receives the failure reason set by InitSingleCoreFunc.
 * @return true when at least one core initialised successfully.
 *
 * NOTE(review): results are OR-ed, so one failing core is masked when the other succeeds — confirm intended.
 */
bool InitSingleSplitFunc(ResolvedFunctionInfo &info, size_t splitIdx,
    const SkBindMap &bindMap, SkBindMap::iterator aicIt, SkBindMap::iterator aivIt,
    aclrtBinHandle binHdl, void *binDevAddr, uint32_t &resolvedNum, BindmapFailReason &failReason)
{
    uint32_t validFuncNum = 0;
    bool anyCoreReady = false;

    if (aicIt != bindMap.end()) {
        CoreFuncInitContext aicCtx = {&info, aicIt, splitIdx, &failReason};
        const bool aicReady = InitSingleCoreFunc<SkNodeCoreType::AIC>(aicCtx, binHdl, binDevAddr, validFuncNum);
        anyCoreReady = anyCoreReady || aicReady;
    }
    if (aivIt != bindMap.end()) {
        CoreFuncInitContext aivCtx = {&info, aivIt, splitIdx, &failReason};
        const bool aivReady = InitSingleCoreFunc<SkNodeCoreType::AIV>(aivCtx, binHdl, binDevAddr, validFuncNum);
        anyCoreReady = anyCoreReady || aivReady;
    }

    if (anyCoreReady) {
        if (validFuncNum > 0) {
            resolvedNum++;
        }
        return true;
    }
    SK_LOGI("Failed to initialize kernel function in sk Node split[%zu]", splitIdx);
    return false;
}
/**
 * @brief Resolve the super-kernel sub-functions of a kernel from its binary's bind map.
 *
 * Steps: validate handles, fetch the bind map, compute the AIC/AIV offsets of the original
 * function relative to the binary's device address, look both up in the bind map, reject
 * conflicting or cap-inconsistent entries, derive the capability bits and finally fill
 * kernelInfos.resolvedFuncs for every split.
 *
 * @param kernelInfos In/out kernel description; on failure bindmapFailReason is set.
 * @return true when all splits were initialised, false otherwise.
 */
bool InitKernelResolvedFuncs(KernelInfos &kernelInfos)
{
    const char* const kernelName = kernelInfos.funcName.c_str();
    // Stores the failure reason and yields the failure result in one step.
    const auto reject = [&kernelInfos](BindmapFailReason why) -> bool {
        kernelInfos.bindmapFailReason = why;
        return false;
    };

    const aclrtBinHandle binHdl = kernelInfos.binHdl;
    const aclrtFuncHandle funcHdl = kernelInfos.funcHdl;
    if (binHdl == nullptr) {
        SK_LOGI("binHdl is null for kernel %s", kernelName);
        return reject(BindmapFailReason::BINHDL_NULL);
    }
    if (funcHdl == nullptr) {
        SK_LOGI("funcHdl is null for kernel %s", kernelName);
        return reject(BindmapFailReason::FUNCHDL_NULL);
    }

    // A local copy is taken: the helpers below take non-const iterators into this map.
    SkBindMap bindMap = GetSkBindMap(binHdl);
    if (bindMap.empty()) {
        SK_LOGI("bindMap is empty for kernel %s", kernelName);
        return reject(BindmapFailReason::BINDMAP_INIT_EMPTY);
    }

    void *binDevAddr = nullptr;
    size_t binDevSize = 0;
    const aclError devAddrRet = aclrtBinaryGetDevAddress(binHdl, &binDevAddr, &binDevSize);
    if (devAddrRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get binary device address for kernel %s, ret=%d",
                kernelName, devAddrRet);
        return reject(BindmapFailReason::BIN_DEV_ADDR_GET_FAILED);
    }

    void *aicAddr = nullptr;
    void *aivAddr = nullptr;
    const aclError funcAddrRet = aclrtGetFunctionAddr(funcHdl, &aicAddr, &aivAddr);
    if (funcAddrRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get function address for kernel %s, ret=%d",
                kernelName, funcAddrRet);
        return reject(BindmapFailReason::FUNC_ADDR_GET_FAILED);
    }

    // Bind-map keys are looked up by offset relative to the binary's device base address.
    const uint64_t aicOffset = (uint64_t)aicAddr - (uint64_t)binDevAddr;
    const uint64_t aivOffset = (uint64_t)aivAddr - (uint64_t)binDevAddr;
    SK_LOGI("funcName=%s, binDevAddr=0x%lx, binDevSize=%lu, aicAddr=0x%lx, aivAddr=0x%lx",
            kernelName, (uint64_t)binDevAddr, binDevSize, (uint64_t)aicAddr, (uint64_t)aivAddr);
    SK_LOGI("aicOffset=0x%lx, aivOffset=0x%lx", aicOffset, aivOffset);

    auto aicItor = bindMap.find(aicOffset);
    auto aivItor = bindMap.find(aivOffset);
    const auto mapEnd = bindMap.end();
    const bool aicFound = (aicItor != mapEnd);
    const bool aivFound = (aivItor != mapEnd);

    // Logs and reports an entry that was marked invalid because of conflicting duplicates.
    const auto hasConflict = [&](const SkBindMap::iterator &entry) -> bool {
        if (entry == mapEnd || !HasInvalidSkBindValue(entry->second)) {
            return false;
        }
        SK_LOGI("Invalid sk bind map for globalFunc=0x%lx, kernel %s has duplicated entries with different values",
                entry->first, kernelName);
        return true;
    };
    if (hasConflict(aicItor) || hasConflict(aivItor)) {
        return reject(BindmapFailReason::BINDMAP_ENTRY_CONFLICT);
    }

    if (!aicFound && !aivFound) {
        SK_LOGI("Function is not found in sk bind map for kernel %s", kernelName);
        return reject(BindmapFailReason::FUNC_NOT_FOUND);
    }

    bool hasCap = false;
    uint64_t cap = 0;
    // Merges the entry's cap into `cap`; logs and reports a disagreement with the recorded value.
    const auto capDisagrees = [&](const SkBindMap::iterator &entry) -> bool {
        if (entry == mapEnd || UpdateKernelCap(entry->second, hasCap, cap)) {
            return false;
        }
        SK_LOGI("Invalid sk bind map for kernel %s, cap is inconsistent", kernelName);
        return true;
    };
    if (capDisagrees(aicItor) || capDisagrees(aivItor)) {
        return reject(BindmapFailReason::BINDMAP_CAP_INCONSISTENT);
    }

    kernelInfos.cap = hasCap ? cap : 0;
    const KernelCapBits capBits = ParseKernelCapBits(kernelInfos.cap);
    kernelInfos.capBits = capBits;
    SK_LOGI("bindMap size=%lu, aicFound=%d, aivFound=%d, earlyStartWaitFlag=%d, "
            "earlyStartSetFlag=%d, disableDcci=%d, disableScheMode=%d",
            bindMap.size(), aicFound, aivFound,
            capBits.earlyStartWaitFlag, capBits.earlyStartSetFlag, capBits.disableDcci, capBits.disableScheMode);

    if (capBits.disableScheMode) {
        const bool originScheModeOn = kernelInfos.isScheModeOn;
        kernelInfos.isScheModeOn = false;
        SK_LOGI("Disable ScheMode by kernel cap, funcName=%s, cap=0x%lx, originIsScheModeOn=%d, "
                "currentIsScheModeOn=%d",
                kernelName, kernelInfos.cap, originScheModeOn, kernelInfos.isScheModeOn);
    }

    kernelInfos.resolvedNum = 0;
    for (size_t split = 0; split < K_MAX_SPLIT_BIN_COUNT; ++split) {
        ResolvedFunctionInfo info{};
        BindmapFailReason splitFailReason = BindmapFailReason::NONE;
        const bool splitReady = InitSingleSplitFunc(info, split, bindMap, aicItor, aivItor,
                                                    binHdl, binDevAddr, kernelInfos.resolvedNum, splitFailReason);
        if (!splitReady) {
            SK_LOGI("Failed to initialize kernel function in sk Node split[%zu]", split);
            return reject(splitFailReason);
        }
        kernelInfos.resolvedFuncs[split] = info;
        SK_LOGI("split[%zu] funcAddr[0]=0x%lx, funcAddr[1]=0x%lx, "
                "prefetchCnt[0]=0x%lx, prefetchCnt[1]=0x%lx, cap=%lu",
                split, info.funcAddr[0], info.funcAddr[1], info.prefetchCnt[0], info.prefetchCnt[1],
                kernelInfos.cap);
    }

    // With exactly two resolved splits, slots 2 and 3 mirror slots 0 and 1.
    if (kernelInfos.resolvedNum == 2) {
        kernelInfos.resolvedFuncs[2] = kernelInfos.resolvedFuncs[0];
        kernelInfos.resolvedFuncs[3] = kernelInfos.resolvedFuncs[1];
    }
    return true;
}
/**
 * @brief Translate an ACL kernel type plus task ratio into the internal SkKernelType.
 * @param kernelType ACL kernel type value.
 * @param taskRatio  Two-element ratio array; only taskRatio[1] is consulted, and only for MIX kernels.
 * @return AIC_ONLY / AIV_ONLY / MIX_AIC_1_1 / MIX_AIC_1_2, or DEFAULT when nothing matches.
 */
SkKernelType NormalizeKernelType(uint32_t kernelType, const uint32_t taskRatio[2])
{
    SkKernelType normalized = SkKernelType::DEFAULT;
    switch (kernelType) {
        case ACL_KERNEL_TYPE_CUBE:
            normalized = SkKernelType::AIC_ONLY;
            break;
        case ACL_KERNEL_TYPE_VECTOR:
            normalized = SkKernelType::AIV_ONLY;
            break;
        case ACL_KERNEL_TYPE_MIX:
            // For MIX kernels the second ratio entry selects the variant.
            switch (taskRatio[1]) {
                case 0:
                    normalized = SkKernelType::AIC_ONLY;
                    break;
                case 1:
                    normalized = SkKernelType::MIX_AIC_1_1;
                    break;
                case 2:
                    normalized = SkKernelType::MIX_AIC_1_2;
                    break;
                default:
                    break;
            }
            break;
        default:
            break;
    }
    return normalized;
}
/**
 * @brief Tell whether a kernel type is one of the MIX variants MIX_AIC_1_1 or MIX_AIC_1_2.
 */
bool IsMixKernelType(SkKernelType kernelType)
{
    switch (kernelType) {
        case SkKernelType::MIX_AIC_1_1:
        case SkKernelType::MIX_AIC_1_2:
            return true;
        default:
            return false;
    }
}
// Character that separates a scope kernel's base name from its architecture suffix.
constexpr char SK_SCOPE_KERNEL_SUFFIX_SEPARATOR = '_';

/**
 * @brief Check whether a kernel name is "<baseName>_<arch suffix>" for a supported architecture.
 * @param kernelName NUL-terminated kernel name to test.
 * @param baseName   NUL-terminated expected prefix.
 * @return true when the prefix matches, is followed by the separator, and the remainder equals
 *         the symbol suffix of one entry of SK_SUPPORTED_KERNEL_ARCHS.
 */
bool IsScopeKernelNameWithSupportedArch(const char* kernelName, const char* baseName)
{
    const size_t prefixLen = strlen(baseName);
    const bool prefixMatches = (strncmp(kernelName, baseName, prefixLen) == 0);
    // The separator is only inspected after the prefix matched, so the index stays inside kernelName.
    if (!prefixMatches || kernelName[prefixLen] != SK_SCOPE_KERNEL_SUFFIX_SEPARATOR) {
        return false;
    }

    const char* const archSuffix = kernelName + prefixLen + 1;
    for (const auto arch : SK_SUPPORTED_KERNEL_ARCHS) {
        const bool sameSuffix = (strcmp(archSuffix, GetSkKernelArchSymbolSuffix(arch)) == 0);
        if (sameSuffix) {
            return true;
        }
    }
    return false;
}
}
/**
 * @brief Format a pointer's numeric value as lowercase hexadecimal with a "0x" prefix.
 * @param ptr Pointer to format; a null pointer yields "0x0".
 * @return The formatted string.
 */
static std::string PtrToHexString(const void* ptr)
{
    const auto numericValue = reinterpret_cast<uintptr_t>(ptr);
    std::ostringstream digits;
    digits << std::hex << numericValue;
    return "0x" + digits.str();
}
/**
 * @brief Format a 64-bit unsigned value as lowercase hexadecimal with a "0x" prefix.
 * @param value Value to format.
 * @return The formatted string, without leading zeros (0 yields "0x0").
 */
static std::string Uint64ToHexString(uint64_t value)
{
    std::ostringstream digits;
    digits << std::hex << value;
    return "0x" + digits.str();
}
/**
 * @brief Serialise a KernelInfos record into a Json object.
 *
 * Pointers and 64-bit values are emitted as hex strings. "argsSize", "isHostArgs" and
 * "taskGrp" are written as fixed values (0, false, "0x0"). At most
 * min(resolvedNum, K_MAX_SPLIT_BIN_COUNT) resolved functions are emitted.
 *
 * @param kernelInfos Record to serialise.
 * @return Json object describing the kernel.
 */
Json KernelInfosToJson(const KernelInfos& kernelInfos)
{
    Json out;
    out["funcName"] = kernelInfos.funcName;
    out["funcHandle"] = PtrToHexString(kernelInfos.funcHdl);
    out["numBlocks"] = kernelInfos.numBlocks;
    out["cap"] = Uint64ToHexString(kernelInfos.cap);
    out["devargs"] = PtrToHexString(kernelInfos.devArgs);
    out["argsSize"] = 0;
    out["isHostArgs"] = false;
    out["launchKernelCfg"] = PtrToHexString(kernelInfos.launchKernelCfg);
    out["binHandle"] = PtrToHexString(kernelInfos.binHdl);
    out["kernelTypeInt"] = kernelInfos.kernelTypeInt;
    out["kernelType"] = to_string(kernelInfos.kernelType);
    out["needMixKernelSplit"] = kernelInfos.needMixKernelSplit;
    out["isSimtOp"] = kernelInfos.isSimtOp;
    out["taskRatio"] = Json::array({kernelInfos.taskRatio[0], kernelInfos.taskRatio[1]});
    out["opInfoPtr"] = PtrToHexString(kernelInfos.opInfoPtr);
    out["opInfoSize"] = static_cast<uint64_t>(kernelInfos.opInfoSize);
    out["taskGrp"] = "0x0";
    out["resolvedNum"] = kernelInfos.resolvedNum;

    Json resolvedList = Json::array();
    for (size_t idx = 0; idx < K_MAX_SPLIT_BIN_COUNT; ++idx) {
        if (!(idx < kernelInfos.resolvedNum)) {
            break;
        }
        const auto& resolved = kernelInfos.resolvedFuncs[idx];
        Json entry;
        entry["funcAddr"][0] = Uint64ToHexString(resolved.funcAddr[0]);
        entry["funcAddr"][1] = Uint64ToHexString(resolved.funcAddr[1]);
        entry["prefetchCnt"][0] = resolved.prefetchCnt[0];
        entry["prefetchCnt"][1] = resolved.prefetchCnt[1];
        entry["funcOffset"][0] = Uint64ToHexString(resolved.funcOffset[0]);
        entry["funcOffset"][1] = Uint64ToHexString(resolved.funcOffset[1]);
        entry["symbolBind"][0] = resolved.symbolBind[0];
        entry["symbolBind"][1] = resolved.symbolBind[1];
        resolvedList.push_back(entry);
    }
    out["resolvedFuncs"] = resolvedList;
    return out;
}
/**
 * @brief Serialise a SyncInfos record into a Json object.
 *
 * "eventId" and "addrValue" are always written. The notify-node id is written only for
 * NODE_WAIT / NODE_MEMORY_WAIT nodes, the id lists only when non-empty, and memoryValue /
 * memoryWaitFlag / eventFlag only when they differ from the maximum value of their integer type.
 *
 * @param syncInfos Record to serialise.
 * @param nodeType  Type of the node that owns the record.
 * @return Json object describing the synchronisation data.
 */
Json SyncInfosToJson(const SyncInfos& syncInfos, SkNodeType nodeType)
{
    constexpr uint64_t kUnset64 = std::numeric_limits<uint64_t>::max();
    constexpr uint32_t kUnset32 = std::numeric_limits<uint32_t>::max();
    const bool isWaitNode = (nodeType == SkNodeType::NODE_WAIT || nodeType == SkNodeType::NODE_MEMORY_WAIT);

    Json out;
    out["eventId"] = Uint64ToHexString(syncInfos.eventId);
    if (isWaitNode) {
        out["correspondingNotifyNodeId"] = syncInfos.correspondingNotifyNodeId;
    }
    out["addrValue"] = PtrToHexString(syncInfos.addrValue);

    const bool hasWaitIds = !syncInfos.correspondingWaitNodeIds.empty();
    if (hasWaitIds) {
        out["correspondingWaitNodeIds"] = syncInfos.correspondingWaitNodeIds;
    }
    const bool hasResetIds = !syncInfos.correspondingResetNodeIds.empty();
    if (hasResetIds) {
        out["correspondingResetNodeIds"] = syncInfos.correspondingResetNodeIds;
    }

    if (syncInfos.memoryValue != kUnset64) {
        out["memoryValue"] = Uint64ToHexString(syncInfos.memoryValue);
    }
    if (syncInfos.memoryWaitFlag != kUnset32) {
        out["memoryWaitFlag"] = syncInfos.memoryWaitFlag;
    }
    if (syncInfos.eventFlag != kUnset64) {
        out["eventFlag"] = Uint64ToHexString(syncInfos.eventFlag);
    }
    return out;
}
/**
 * @brief Serialise a NodeInfos record; sync data is included for every node type except NODE_KERNEL.
 * @param nodeInfos Record to serialise.
 * @param nodeType  Type of the owning node.
 * @return Json object with "kernelInfos" and, for non-kernel nodes, "syncInfos".
 */
Json NodeInfosToJson(const NodeInfos& nodeInfos, SkNodeType nodeType)
{
    Json out;
    out["kernelInfos"] = KernelInfosToJson(nodeInfos.kernelInfos);
    const bool isKernelNode = (nodeType == SkNodeType::NODE_KERNEL);
    if (!isKernelNode) {
        out["syncInfos"] = SyncInfosToJson(nodeInfos.syncInfos, nodeType);
    }
    return out;
}
* @brief Get kernel type string from kernelType value
*/
const char* GetKernelTypeString(uint32_t kernelType, const uint32_t taskRatio[2])
{
SkKernelType skType = NormalizeKernelType(kernelType, taskRatio);
switch (skType) {
case SkKernelType::AIC_ONLY:
return "AIC_ONLY";
case SkKernelType::AIV_ONLY:
return "AIV_ONLY";
case SkKernelType::MIX_AIC_1_1:
return "MIX_AIC_1_1";
case SkKernelType::MIX_AIC_1_2:
return "MIX_AIC_1_2";
default:
return "DEFAULT";
}
}
/**
 * @brief Convert a byte size into a count of 0x800-byte units, rounded up and clamped per core.
 *
 * The count is ceil(value / 0x800). For coreIdx 0 it is limited to 16 units, for coreIdx 1
 * to 8 units; other core indices are not clamped. The rounding is done with division and
 * remainder so that very large inputs cannot wrap around to a small count.
 *
 * @param value   Size in bytes.
 * @param coreIdx Core index selecting the clamp (0 -> 16 units, 1 -> 8 units).
 * @return Number of 0x800-byte units after rounding up and clamping.
 */
size_t AlignUpAndClamp(size_t value, size_t coreIdx)
{
    constexpr size_t alignNum = 0x800;
    constexpr size_t aicMaxUnits = 16;
    constexpr size_t aivMaxUnits = 8;

    // Overflow-free ceil division (value + alignNum - 1 could wrap for huge values).
    size_t units = value / alignNum;
    if (value % alignNum != 0) {
        ++units;
    }

    if (coreIdx == 0 && units > aicMaxUnits) {
        units = aicMaxUnits;
    } else if (coreIdx == 1 && units > aivMaxUnits) {
        units = aivMaxUnits;
    }
    return units;
}
/**
 * @brief Name of a task type for log output.
 * @param type Task type.
 * @return "KERNEL", "VALUE_WRITE", "VALUE_WAIT", or "UNKNOWN" for any other type.
 */
const char* SuperKernelBaseNode::GetUpdateTargetTypeName(aclmdlRITaskType type) const
{
    const char* typeName = "UNKNOWN";
    switch (type) {
        case ACL_MODEL_RI_TASK_KERNEL:
            typeName = "KERNEL";
            break;
        case ACL_MODEL_RI_TASK_VALUE_WRITE:
            typeName = "VALUE_WRITE";
            break;
        case ACL_MODEL_RI_TASK_VALUE_WAIT:
            typeName = "VALUE_WAIT";
            break;
        default:
            break;
    }
    return typeName;
}
/**
 * @brief Base initialisation: take the node id from the origin task's sequence id.
 * @param opts Options manager; not used by the base implementation.
 * @return false when the origin task is null or its sequence id cannot be read; true otherwise.
 */
bool SuperKernelBaseNode::InitNode(const SuperKernelOptionsManager* opts)
{
    static_cast<void>(opts);
    if (originTask == nullptr) {
        SK_LOGE("Origin task is null for %s", Format().c_str());
        return false;
    }

    uint32_t seqId = 0;
    const bool seqIdOk = (aclmdlRITaskGetSeqId(*originTask, &seqId) == ACL_SUCCESS);
    if (!seqIdOk) {
        SK_LOGE("Failed to get nodeId for %s", Format().c_str());
        return false;
    }
    nodeId = static_cast<uint64_t>(seqId);
    return true;
}
/**
 * @brief Mark the node as updated; a node can be updated only once.
 * @param ctx Update context; not read by the base implementation.
 * @return false (with an error log) when the node was already updated, true otherwise.
 */
bool SuperKernelBaseNode::Update(const UpdateContext &ctx)
{
    if (!isUpdate) {
        isUpdate = true;
        SK_LOGI("Updating node %lu.", nodeId);
        return true;
    }
    SK_LOGE("Node has already been updated and cannot be updated again, %s", Format().c_str());
    return false;
}
/**
 * @brief Log the task parameters that resulted from updating this node.
 *
 * A null @p resultParams is logged as "type=INVALID". Otherwise the type name is logged,
 * followed by the type-specific fields for KERNEL, VALUE_WRITE and VALUE_WAIT tasks.
 *
 * @param resultParams Parameters to describe; may be null.
 */
void SuperKernelBaseNode::LogNodeUpdateResult(const aclmdlRITaskParams* resultParams) const
{
    std::ostringstream line;
    line << "node update result: nodeId=" << nodeId;

    if (resultParams != nullptr) {
        line << ", type=" << GetUpdateTargetTypeName(resultParams->type);
        switch (resultParams->type) {
            case ACL_MODEL_RI_TASK_KERNEL: {
                const auto& kernel = resultParams->kernelTaskParams;
                line << ", opInfoPtr=" << resultParams->opInfoPtr;
                line << ", opInfoSize=" << resultParams->opInfoSize;
                line << ", funcHandle=" << kernel.funcHandle;
                line << ", args=" << kernel.args;
                line << ", argsSize=" << kernel.argsSize;
                line << ", numBlocks=" << static_cast<uint32_t>(kernel.numBlocks);
                break;
            }
            case ACL_MODEL_RI_TASK_VALUE_WRITE: {
                const auto& write = resultParams->valueWriteTaskParams;
                line << ", addr=" << write.devAddr;
                line << ", value=0x" << std::hex << write.value << std::dec;
                break;
            }
            case ACL_MODEL_RI_TASK_VALUE_WAIT: {
                const auto& wait = resultParams->valueWaitTaskParams;
                line << ", addr=" << wait.devAddr;
                // Hex mode stays active for both value and flag, then is switched back to decimal.
                line << ", value=0x" << std::hex << wait.value;
                line << ", flag=0x" << wait.flag << std::dec;
                break;
            }
            default:
                break;
        }
    } else {
        line << ", type=INVALID";
    }
    SK_LOGI("%s", line.str().c_str());
}
/**
 * @brief Disable the origin task so the node no longer runs on its own.
 * @return ACL_SUCCESS when the task was disabled (isInvalidated is then set), otherwise the
 *         error returned by aclmdlRITaskDisable.
 */
aclError SuperKernelBaseNode::InValidateNode()
{
    SK_LOGI("Invalidating node %lu for super kernel fusion.", nodeId);
    const aclError disableRet = aclmdlRITaskDisable(*originTask);
    if (disableRet == ACL_SUCCESS) {
        isInvalidated = true;
        SK_LOGI("Node %lu was invalidated successfully.", nodeId);
        return ACL_SUCCESS;
    }
    SK_LOGE("Failed to invalidate node %s", Format().c_str());
    return disableRet;
}
/**
 * @brief Result of inspecting a kernel task for scope-kernel markers (filled by IsScopeKernel).
 */
struct JudgeTaskKernelInfo {
    bool isBegin{false};        // name matched "sk_scope_kernel_begin"
    bool isEnd{false};          // name matched "sk_scope_kernel_end"
    bool isPlaceholder{false};  // name matched "sk_placeholder_kernel"
    bool isFuseEnable{true};    // cleared when the scope name is the default scope name
    std::unique_ptr<char[]> scopeName;  // NUL-terminated copy of the scope name; null until set
};
/**
 * @brief Decide whether a kernel task is a scope begin/end/placeholder kernel and extract its scope name.
 *
 * The function name of params.funcHandle is matched against the three scope kernel base names
 * (with a supported architecture suffix). For a match, the kernel arguments are copied from
 * device to host as a ScopeKernelArgs and its name is stored in @p info.
 *
 * @param params Kernel task parameters (function handle and device argument pointer are used).
 * @param info   Output record; must not be null.
 * @return true when the kernel is a scope kernel and @p info was filled; false otherwise,
 *         including when a runtime call fails.
 */
bool IsScopeKernel(aclmdlRIKernelTaskParams params, JudgeTaskKernelInfo* info)
{
    const char* defaultScopeName = "default_sk_scope_name";

    char kernelName[MAX_SCOPE_NAME_LEN] = {0};
    const int32_t nameRet = aclrtGetFunctionName(params.funcHandle, sizeof(kernelName), kernelName);
    if (nameRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get kernel name for funcHandle, ret: %d", nameRet);
        return false;
    }

    const bool isBegin = IsScopeKernelNameWithSupportedArch(kernelName, "sk_scope_kernel_begin");
    const bool isEnd = IsScopeKernelNameWithSupportedArch(kernelName, "sk_scope_kernel_end");
    const bool isPlaceholder = IsScopeKernelNameWithSupportedArch(kernelName, "sk_placeholder_kernel");
    const bool isAnyScopeKernel = isBegin || isEnd || isPlaceholder;
    if (!isAnyScopeKernel) {
        SK_LOGD("Current kernel is not a scope kernel or uses unsupported arch suffix, kernelName=%s", kernelName);
        return false;
    }

    auto hostArgs = std::make_unique<ScopeKernelArgs>();
    const int32_t copyRet = aclrtMemcpy(static_cast<void*>(hostArgs.get()), sizeof(ScopeKernelArgs),
                                        params.args, sizeof(ScopeKernelArgs), ACL_MEMCPY_DEVICE_TO_HOST);
    if (copyRet != ACL_SUCCESS) {
        SK_LOGE("Failed to copy kernel args from device to host, ret: %d, direction=DEVICE_TO_HOST", copyRet);
        return false;
    }

    // Force termination so strlen below cannot run past the name buffer.
    hostArgs->name[MAX_SCOPE_NAME_LEN - 1] = '\0';
    const size_t nameBytes = strlen(hostArgs->name) + 1;
    info->scopeName = std::make_unique<char[]>(nameBytes);
    const errno_t nameCopyRet = memcpy_s(info->scopeName.get(), nameBytes, hostArgs->name, nameBytes);
    if (nameCopyRet != 0) {
        SK_LOGE("Failed to copy scope name '%s', memcpy_s error code: %d", hostArgs->name, nameCopyRet);
        return false;
    }

    info->isBegin = isBegin;
    info->isEnd = isEnd;
    info->isPlaceholder = isPlaceholder;
    // The default scope name disables fusion for this scope.
    const bool usesDefaultScope = (strcmp(info->scopeName.get(), defaultScopeName) == 0);
    if (usesDefaultScope) {
        info->isFuseEnable = false;
    }
    SK_LOGI("Success parse scope kernel task, kernelName: %s, scopeName: %s, isBegin: %d, isEnd: %d, "
            "isPlaceholder: %d, isFuseEnable: %d",
            kernelName, info->scopeName.get(), info->isBegin, info->isEnd, info->isPlaceholder, info->isFuseEnable);
    return true;
}
/**
 * @brief Initialise a kernel node from its origin task.
 *
 * Runs the base initialisation, reads the task parameters, detects scope kernels, queries the
 * kernel type / task ratio / binary handle / function name, derives cube and vector block
 * counts, decides whether a MIX kernel needs splitting, resolves the super-kernel functions
 * for non-scope kernels, and finally marks the node non-fusible when it has a task group.
 *
 * @param opts Options manager; may be null. Used for the UBUF_LOCK_IGNORE_KERNEL option and
 *             forwarded to the base initialisation and SIMT handling.
 * @return false when a required runtime query fails; true otherwise (the node may still be
 *         non-fusible, see isFusible and the recorded fusion fail reason).
 */
bool SuperKernelKernelNode::InitNode(const SuperKernelOptionsManager* opts) {
    if (!SuperKernelBaseNode::InitNode(opts)) {
        SK_LOGE("Failed to init kernel node for %s", Format().c_str());
        return false;
    }
    nodeType = SkNodeType::NODE_KERNEL;
    aclError aclRet = aclmdlRITaskGetParams(*originTask, &taskParams);
    if (aclRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get kernel params for %s", Format().c_str());
        return false;
    }

    JudgeTaskKernelInfo scopeKernelInfo;
    auto &kernelParams = taskParams.kernelTaskParams;
    if (IsScopeKernel(kernelParams, &scopeKernelInfo)) {
        SK_LOGI("Kernel node %lu is a scope kernel node.", nodeId);
        isScopeNode = true;
        isFusible = scopeKernelInfo.isFuseEnable;
        isScopeBegin = scopeKernelInfo.isBegin;
        isScopeEnd = scopeKernelInfo.isEnd;
        isPlaceholder = scopeKernelInfo.isPlaceholder;
        if (isFusible && scopeKernelInfo.scopeName != nullptr) {
            scopeName = std::string(scopeKernelInfo.scopeName.get());
        }
    } else {
        SK_LOGI("Kernel node %lu is a regular kernel node.", nodeId);
    }

    int64_t kernelType = 0;
    aclRet = aclrtGetFunctionAttribute(kernelParams.funcHandle, ACL_FUNC_ATTR_KERNEL_TYPE, &kernelType);
    if (aclRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get kernel type for node %s, ret=%d", Format().c_str(), aclRet);
        SetFusionFailReason(FusionFailReason::KERNEL_ATTR_GET_FAILED);
        return false;
    }
    int64_t taskRatio = 0;
    aclRet = aclrtGetFunctionAttribute(kernelParams.funcHandle, ACL_FUNC_ATTR_KERNEL_RATIO, &taskRatio);
    if (aclRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get task ratio for node %s, ret=%d", Format().c_str(), aclRet);
        SetFusionFailReason(FusionFailReason::KERNEL_ATTR_GET_FAILED);
        return false;
    }

    // The first four bytes of the attribute value are read as two int16 lanes. They are copied
    // out byte-wise: reading an int64_t object through an int16_t pointer breaks strict aliasing.
    int16_t ratioLanes[2] = {0, 0};
    static_assert(sizeof(ratioLanes) <= sizeof(taskRatio), "ratio lanes must fit inside the attribute value");
    std::memcpy(ratioLanes, &taskRatio, sizeof(ratioLanes));
    // Lane 1 becomes taskRatio[0] and lane 0 becomes taskRatio[1].
    const uint32_t skTaskRatio[2] = {static_cast<uint32_t>(ratioLanes[1]), static_cast<uint32_t>(ratioLanes[0])};

    nodeInfos.kernelInfos.taskRatio[0] = skTaskRatio[0];
    nodeInfos.kernelInfos.taskRatio[1] = skTaskRatio[1];
    nodeInfos.kernelInfos.kernelType = NormalizeKernelType(static_cast<uint32_t>(kernelType), skTaskRatio);
    nodeInfos.kernelInfos.kernelTypeInt = static_cast<uint32_t>(kernelType);
    nodeInfos.kernelInfos.numBlocks = kernelParams.numBlocks;
    nodeInfos.kernelInfos.devArgs = kernelParams.args;
    nodeInfos.kernelInfos.opInfoPtr = taskParams.opInfoPtr;
    nodeInfos.kernelInfos.opInfoSize = taskParams.opInfoSize;
    nodeInfos.kernelInfos.launchKernelCfg = kernelParams.cfg;
    nodeInfos.kernelInfos.isScheModeOn = GetScheMode();

    aclRet = aclrtFunctionGetBinary(kernelParams.funcHandle, &nodeInfos.kernelInfos.binHdl);
    if (aclRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get kernel bin handle for %s, ret=%d",
                Format().c_str(), aclRet);
        return false;
    }
    nodeInfos.kernelInfos.funcHdl = kernelParams.funcHandle;

    // Derive the cube / vector block counts from the normalized kernel type.
    const uint32_t numBlocks = kernelParams.numBlocks;
    const SkKernelType kt = nodeInfos.kernelInfos.kernelType;
    if (kt == SkKernelType::AIC_ONLY || kt == SkKernelType::MIX_AIC_1_0) {
        nodeInfos.kernelInfos.cubeNum = numBlocks;
        nodeInfos.kernelInfos.vecNum = 0;
    } else if (kt == SkKernelType::AIV_ONLY || kt == SkKernelType::MIX_AIV_1_0) {
        nodeInfos.kernelInfos.cubeNum = 0;
        nodeInfos.kernelInfos.vecNum = numBlocks;
    } else if (kt == SkKernelType::MIX_AIC_1_1) {
        nodeInfos.kernelInfos.cubeNum = numBlocks;
        nodeInfos.kernelInfos.vecNum = numBlocks;
    } else if (kt == SkKernelType::MIX_AIC_1_2) {
        nodeInfos.kernelInfos.cubeNum = numBlocks;
        nodeInfos.kernelInfos.vecNum = numBlocks << 1;
    }

    char tmpFuncName[256] = {0};
    aclRet = aclrtGetFunctionName(kernelParams.funcHandle, sizeof(tmpFuncName), tmpFuncName);
    if (aclRet != ACL_SUCCESS) {
        SK_LOGE("Failed to get function name for node %s, ret=%d", Format().c_str(), aclRet);
        SetFusionFailReason(FusionFailReason::KERNEL_ATTR_GET_FAILED);
        return false;
    }
    nodeInfos.kernelInfos.funcName = std::string(tmpFuncName);

    // MIX kernels are split unless the kernel is listed in the UBUF_LOCK_IGNORE_KERNEL option.
    nodeInfos.kernelInfos.needMixKernelSplit = IsMixKernelType(nodeInfos.kernelInfos.kernelType);
    const auto* ubufLockIgnoreKernelOpt = opts == nullptr ? nullptr :
        opts->GetOption(aclskOptionType::UBUF_LOCK_IGNORE_KERNEL);
    if (nodeInfos.kernelInfos.needMixKernelSplit && ubufLockIgnoreKernelOpt != nullptr &&
        !nodeInfos.kernelInfos.funcName.empty() &&
        opts->JudgeUbufLockIgnoreKernel(
            ubufLockIgnoreKernelOpt->GetStringListValue(), nodeInfos.kernelInfos.funcName)) {
        nodeInfos.kernelInfos.needMixKernelSplit = false;
    }
    SK_LOGI("Kernel node %lu mix split flag initialized, funcName=%s, kernelType=%s, needMixKernelSplit=%d",
            nodeId, nodeInfos.kernelInfos.funcName.c_str(), to_string(nodeInfos.kernelInfos.kernelType),
            static_cast<int>(nodeInfos.kernelInfos.needMixKernelSplit));

    // Only regular (non-scope) kernels are resolved against the bind map.
    if (!isScopeNode && !nodeInfos.kernelInfos.funcName.empty() && nodeInfos.kernelInfos.binHdl != nullptr) {
        isFusible = InitKernelResolvedFuncs(nodeInfos.kernelInfos);
        if (!isFusible) {
            SetFusionFailReason(FusionFailReason::BINDMAP_IS_EMPTY, nodeInfos.kernelInfos.bindmapFailReason);
        }
    }
    IdentifyAndHandleSimtKernel(opts);

    if (taskParams.taskGrp != nullptr) {
        SK_LOGI("Kernel node %lu has a non-null task group and cannot be fused in super kernel.", nodeId);
        isFusible = false;
        SetFusionFailReason(FusionFailReason::TASK_GROUP_NOT_EMPTY);
    }
    return true;
}
bool SuperKernelKernelNode::GetScheMode() const
{
const aclmdlRIKernelTaskParams& kernelParams = taskParams.kernelTaskParams;
const ScheModeState funcAttrScheModeState = GetScheModeFromFuncAttr(kernelParams.funcHandle);
const ScheModeState launchAttrScheModeState = GetScheModeFromKernelTask(*originTask);
ScheModeState finalScheModeState = ScheModeState::SCHE_MODE_OFF;
if (launchAttrScheModeState != ScheModeState::NONE) {
finalScheModeState = launchAttrScheModeState;
} else if (funcAttrScheModeState != ScheModeState::NONE) {
finalScheModeState = funcAttrScheModeState;
}
SK_LOGI("schemode detect result: funcAttrState=%ld, launchAttrState=%ld, finalState=%ld",
static_cast<int64_t>(funcAttrScheModeState),
static_cast<int64_t>(launchAttrScheModeState),
static_cast<int64_t>(finalScheModeState));
return finalScheModeState == ScheModeState::SCHE_MODE_ON;
}
void SuperKernelKernelNode::IdentifyAndHandleSimtKernel(const SuperKernelOptionsManager* opts) {
nodeInfos.kernelInfos.isSimtOp = false;
if (opts == nullptr) {
return;
}
const auto* simtCheckOpt = opts->GetOption(SkInnerOptionType::ENABLE_SIMT_OP_CHECK);
if (simtCheckOpt == nullptr || simtCheckOpt->GetIntValue() != 1) {
return;
}
SkKernelType kernelType = nodeInfos.kernelInfos.kernelType;
bool hasAivSection = (kernelType == SkKernelType::AIV_ONLY ||
kernelType == SkKernelType::MIX_AIV_1_0 ||
kernelType == SkKernelType::MIX_AIC_1_1 ||
kernelType == SkKernelType::MIX_AIC_1_2);
if (!hasAivSection) {
SK_LOGI("IdentifyAndHandleSimtKernel: %s has no AIV section (kernelType=%s), skip SIMT check",
Format().c_str(), to_string(kernelType));
return;
}
SK_LOGI("IdentifyAndHandleSimtKernel: checking for %s, kernelType=%s, nodeId=%lu",
Format().c_str(), to_string(kernelType), nodeId);
uint32_t aivType = 0;
rtError_t ret = rtFunctionGetMetaInfo(taskParams.kernelTaskParams.funcHandle,
RT_FUNCTION_TYPE_AIV_TYPE_FLAG, &aivType, sizeof(uint32_t));
if (ret != RT_ERROR_NONE) {
SK_LOGD("rtFunctionGetMetaInfo AIV_TYPE_FLAG failed for %s, ret=%d", Format().c_str(), ret);
return;
}
bool isSimt = (aivType == AIV_TYPE_SIMT_VF_ONLY || aivType == AIV_TYPE_SIMD_SIMT_MIX_VF);
if (isSimt) {
nodeInfos.kernelInfos.isSimtOp = true;
isFusible = false;
SetFusionFailReason(FusionFailReason::SIMT_OP_NOT_SUPPORTED);
SK_LOGI("%s is SIMT type, aivType=%u, not fusible", Format().c_str(), aivType);
}
return;
}
/**
 * @brief Build a one-line description of this kernel node for logging.
 *
 * The text consists of the node/stream identifiers in square brackets followed
 * by " - " and the output of KernelInfos::Format() for this node.
 *
 * @return The formatted description.
 */
std::string SuperKernelKernelNode::Format() const {
    std::ostringstream text;
    text << "[nodeId:" << nodeId;
    text << ", streamId:" << streamId;
    text << ", streamIdxInGraph:" << streamIdxInGraph;
    text << ", nodeIdxInStream:" << nodeIdxInStream;
    text << "] - " << nodeInfos.kernelInfos.Format();
    return text.str();
}
/**
 * @brief Render the kernel information as a "KernelInfos{...}" string for logging.
 *
 * Scalar fields are always printed. isSimtOp is printed only when it is set,
 * and each handle/pointer field (binHdl, funcHdl, launchKernelCfg, devArgs) is
 * printed as a hexadecimal address only when it is non-null.
 *
 * @return The formatted description.
 */
std::string KernelInfos::Format() const {
    std::ostringstream oss;
    // Prints "<label><hex value>" and restores decimal output afterwards.
    const auto appendHex = [&oss](const char* label, uintptr_t value) {
        oss << label << std::hex << value << std::dec;
    };

    oss << "KernelInfos{funcName:" << funcName;
    oss << ", kernelType:" << to_string(kernelType);
    oss << ", taskRatio:[" << taskRatio[0] << "," << taskRatio[1] << "]";
    oss << ", cap:" << cap;
    oss << ", numBlocks:" << numBlocks;
    oss << ", cubeNum:" << cubeNum;
    oss << ", vecNum:" << vecNum;
    oss << ", isScheModeOn:" << isScheModeOn;
    oss << ", needMixKernelSplit:" << needMixKernelSplit;
    if (isSimtOp) {
        oss << ", isSimtOp:" << isSimtOp;
    }
    oss << ", resolvedNum:" << resolvedNum;

    if (binHdl != nullptr) {
        appendHex(", binHdl:0x", reinterpret_cast<uintptr_t>(binHdl));
    }
    if (funcHdl != nullptr) {
        appendHex(", funcHdl:0x", reinterpret_cast<uintptr_t>(funcHdl));
    }
    if (launchKernelCfg != nullptr) {
        appendHex(", launchKernelCfg:0x", reinterpret_cast<uintptr_t>(launchKernelCfg));
    }
    if (devArgs != nullptr) {
        appendHex(", devArgs:0x", reinterpret_cast<uintptr_t>(devArgs));
    }
    oss << "}";
    return oss.str();
}
/**
 * @brief Apply an update to the underlying task of this kernel node.
 *
 * After the base-class update succeeds, exactly one of three paths is taken:
 *  1. ctx.customParams is non-null with a non-zero type: the params are
 *     validated (value write / value wait must carry a non-null devAddr),
 *     pushed to the task via aclmdlRITaskSetParams and cached in taskParams.
 *  2. ctx.launchInfo is non-null with a non-null skEntryFunc: the cached kernel
 *     task params are rewritten from the launch info (args, args size, entry
 *     function, block count, op info) and pushed to the task.
 *  3. Otherwise the node is invalidated through InValidateNode().
 * The outcome is then passed to LogNodeUpdateResult (nullptr on path 3).
 *
 * @param ctx Update context carrying optional custom params and launch info.
 * @return true on success, false if validation or any runtime call fails.
 */
bool SuperKernelKernelNode::Update(const UpdateContext &ctx) {
    if (!SuperKernelBaseNode::Update(ctx)) {
        SK_LOGE("Failed to update base node for %s", Format().c_str());
        return false;
    }
    const aclmdlRITaskParams* resultParams = nullptr;
    if (ctx.customParams != nullptr && ctx.customParams->type != 0) {
        // Validate each value task through its own parameter member, matching how
        // SuperKernelMemoryNode::InitNode reads these two task types.
        switch (ctx.customParams->type) {
            case ACL_MODEL_RI_TASK_VALUE_WRITE:
                if (ctx.customParams->valueWriteTaskParams.devAddr == nullptr) {
                    SK_LOGE("Custom params for kernel node %s has null devAddr, invalid params.", Format().c_str());
                    return false;
                }
                break;
            case ACL_MODEL_RI_TASK_VALUE_WAIT:
                if (ctx.customParams->valueWaitTaskParams.devAddr == nullptr) {
                    SK_LOGE("Custom params for kernel node %s has null devAddr, invalid params.", Format().c_str());
                    return false;
                }
                break;
            default:
                SK_LOGI("custom param type : %u not in check list, which will direct update, %s", ctx.customParams->type, Format().c_str());
                break;
        }
        aclError aclRet = aclmdlRITaskSetParams(*originTask, ctx.customParams);
        if (aclRet != ACL_SUCCESS) {
            SK_LOGE("Failed to set kernel with custom params for %s", Format().c_str());
            return false;
        }
        taskParams = *ctx.customParams;
        resultParams = &taskParams;
    } else if (ctx.launchInfo != nullptr && ctx.launchInfo->entryInfo.skEntryFunc != nullptr) {
        // The args buffer is dereferenced below to read its total size, so reject a
        // missing buffer up front instead of dereferencing a null pointer.
        auto* const skArgs = ctx.launchInfo->devArgs.Get();
        if (skArgs == nullptr) {
            SK_LOGE("Launch info for kernel node %s has null args buffer, invalid params.", Format().c_str());
            return false;
        }
        taskParams.kernelTaskParams.args = static_cast<void*>(skArgs);
        taskParams.kernelTaskParams.argsSize = skArgs->skHeader.totalSize;
        taskParams.kernelTaskParams.isHostArgs = true;
        taskParams.kernelTaskParams.funcHandle = ctx.launchInfo->entryInfo.skEntryFunc;
        taskParams.kernelTaskParams.numBlocks = ctx.launchInfo->entryInfo.numBlocks;
        taskParams.type = ACL_MODEL_RI_TASK_KERNEL;
        taskParams.opInfoPtr = ctx.launchInfo->cacheInfo;
        taskParams.opInfoSize = ctx.launchInfo->cacheopInfoSize;
        aclError aclRet = aclmdlRITaskSetParams(*originTask, &taskParams);
        if (aclRet != ACL_SUCCESS) {
            SK_LOGE("Failed to update kernel node %s", Format().c_str());
            return false;
        }
        resultParams = &taskParams;
    } else {
        aclError aclRet = InValidateNode();
        if (aclRet != ACL_SUCCESS) {
            return false;
        }
    }
    LogNodeUpdateResult(resultParams);
    return true;
}
bool SuperKernelMemoryNode::InitNode(const SuperKernelOptionsManager* opts) {
if (!SuperKernelBaseNode::InitNode(opts)) {
SK_LOGE("Failed to init memory node for %s", Format().c_str());
return false;
}
aclError aclRet = aclmdlRITaskGetParams(*originTask, &taskParams);
if (aclRet != ACL_SUCCESS) {
SK_LOGE("Failed to get task params (aclRet=%d) for %s", aclRet, Format().c_str());
return false;
}
if (rtNodeType != ACL_MODEL_RI_TASK_VALUE_WRITE && rtNodeType != ACL_MODEL_RI_TASK_VALUE_WAIT) {
switch (rtNodeType) {
case ACL_MODEL_RI_TASK_EVENT_RECORD: {
const auto &eventParam = taskParams.eventRecordTaskParams;
nodeType = SkNodeType::NODE_NOTIFY;
nodeInfos.syncInfos.eventId = reinterpret_cast<uintptr_t>(eventParam.event);
nodeInfos.syncInfos.eventFlag = eventParam.eventFlag;
nodeInfos.syncInfos.memoryValue = SK_DEFAULT_NOTIFY_VALUE;
nodeInfos.syncInfos.memoryWaitFlag = SK_DEFAULT_WRITE_FLAG;
break;
}
case ACL_MODEL_RI_TASK_EVENT_WAIT: {
const auto &eventParam = taskParams.eventWaitTaskParams;
nodeType = SkNodeType::NODE_WAIT;
nodeInfos.syncInfos.eventId = reinterpret_cast<uintptr_t>(eventParam.event);
nodeInfos.syncInfos.eventFlag = eventParam.eventFlag;
nodeInfos.syncInfos.memoryValue = SK_DEFAULT_WAIT_VALUE;
nodeInfos.syncInfos.memoryWaitFlag = static_cast<uint32_t>(SkMemoryWaitFlag::EQ);
break;
}
case ACL_MODEL_RI_TASK_EVENT_RESET: {
const auto &eventParam = taskParams.eventResetTaskParams;
nodeType = SkNodeType::NODE_RESET;
nodeInfos.syncInfos.eventId = reinterpret_cast<uintptr_t>(eventParam.event);
nodeInfos.syncInfos.eventFlag = eventParam.eventFlag;
nodeInfos.syncInfos.memoryValue = SK_DEFAULT_RESET_VALUE;
nodeInfos.syncInfos.memoryWaitFlag = SK_DEFAULT_WRITE_FLAG;
break;
}
default:
SK_LOGE("Unsupported event type %u for %s, which cannot be fused in super kernel.",
rtNodeType, Format().c_str());
SetFusionFailReason(FusionFailReason::UNSUPPORT_EVENT_TYPE);
return false;
}
if ((nodeInfos.syncInfos.eventFlag & ACL_EVENT_EXTERNAL) == 0) {
isFusible = true;
SK_LOGI("Event %s: internal to ModelRI, fusible in super kernel", Format().c_str());
} else {
isFusible = false;
SetFusionFailReason(FusionFailReason::EXTERNAL_DEPEND);
SK_LOGI("Event %s: has external dependencies or is reset, cannot be fused in super kernel",
Format().c_str());
}
if (rtNodeType == ACL_MODEL_RI_TASK_EVENT_RESET) {
isFusible = false;
SetFusionFailReason(FusionFailReason::RESET_TYPE_NODE);
SK_LOGI("Event %s: is reset type, cannot be fused in super kernel", Format().c_str());
}
return true;
}
if (rtNodeType == ACL_MODEL_RI_TASK_VALUE_WRITE) {
const auto& memoryParam = taskParams.valueWriteTaskParams;
nodeType = SkNodeType::NODE_MEMORY_WRITE;
nodeInfos.syncInfos.eventId = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(memoryParam.devAddr));
nodeInfos.syncInfos.addrValue = memoryParam.devAddr;
nodeInfos.syncInfos.memoryValue = memoryParam.value;
} else {
const auto& memoryParam = taskParams.valueWaitTaskParams;
nodeType = SkNodeType::NODE_MEMORY_WAIT;
nodeInfos.syncInfos.eventId = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(memoryParam.devAddr));
nodeInfos.syncInfos.addrValue = memoryParam.devAddr;
nodeInfos.syncInfos.memoryValue = memoryParam.value;
nodeInfos.syncInfos.memoryWaitFlag = memoryParam.flag;
}
if (nodeInfos.syncInfos.addrValue == nullptr) {
SK_LOGE("Memory node %s has null device address, which is invalid for super kernel fusion.", Format().c_str());
return false;
}
SK_LOGI("Memory node %s default not fusible, but it may be bypassed", Format().c_str());
return true;
}
/**
 * @brief Apply an update to the underlying task of this memory node.
 *
 * After the base-class update succeeds, one of two paths is taken:
 *  1. ctx.customParams is non-null with a non-zero type: the params are
 *     validated (value write / value wait must carry a non-null devAddr),
 *     pushed to the task via aclmdlRITaskSetParams and cached in taskParams.
 *  2. Otherwise the node is invalidated through InValidateNode().
 * The outcome is then passed to LogNodeUpdateResult (nullptr on path 2).
 *
 * @param ctx Update context carrying the optional custom params.
 * @return true on success, false if validation or any runtime call fails.
 */
bool SuperKernelMemoryNode::Update(const UpdateContext &ctx) {
    if (!SuperKernelBaseNode::Update(ctx)) {
        SK_LOGE("Failed to update base node for %s", Format().c_str());
        return false;
    }
    const aclmdlRITaskParams* resultParams = nullptr;
    if (ctx.customParams != nullptr && ctx.customParams->type != 0) {
        // Validate each value task through its own parameter member, matching how
        // InitNode reads these two task types.
        switch (ctx.customParams->type) {
            case ACL_MODEL_RI_TASK_VALUE_WRITE:
                if (ctx.customParams->valueWriteTaskParams.devAddr == nullptr) {
                    SK_LOGE("Custom params for memory node %s has null devAddr, invalid params.", Format().c_str());
                    return false;
                }
                break;
            case ACL_MODEL_RI_TASK_VALUE_WAIT:
                if (ctx.customParams->valueWaitTaskParams.devAddr == nullptr) {
                    SK_LOGE("Custom params for memory node %s has null devAddr, invalid params.", Format().c_str());
                    return false;
                }
                break;
            default:
                SK_LOGI("custom param type : %u not in check list, which will direct update, %s", ctx.customParams->type, Format().c_str());
                break;
        }
        aclError aclRet = aclmdlRITaskSetParams(*originTask, ctx.customParams);
        if (aclRet != ACL_SUCCESS) {
            SK_LOGE("Failed to set custom params on memory node %s", Format().c_str());
            return false;
        }
        taskParams = *ctx.customParams;
        resultParams = &taskParams;
    } else {
        aclError aclRet = InValidateNode();
        if (aclRet != ACL_SUCCESS) {
            return false;
        }
    }
    LogNodeUpdateResult(resultParams);
    return true;
}
/**
 * @brief Build a one-line description of this memory/event node for logging.
 *
 * Every variant starts with the node/stream identifiers. Value-write and
 * value-wait nodes then print their value (and wait flag) plus the event id in
 * hex; all other node types print a type label ("EventNotify", "EventWait",
 * "EventReset" or "Unknown") with the event id and event flag in hex.
 *
 * @return The formatted description.
 */
std::string SuperKernelMemoryNode::Format() const {
    std::ostringstream oss;
    // Identifier prefix shared by every variant.
    oss << "[nodeId:" << nodeId
        << ", streamId:" << streamId
        << ", streamIdxInGraph:" << streamIdxInGraph
        << ", nodeIdxInStream:" << nodeIdxInStream;

    if (rtNodeType == ACL_MODEL_RI_TASK_VALUE_WRITE) {
        oss << ", MemoryWrite(value:0x" << std::hex << nodeInfos.syncInfos.memoryValue
            << std::dec << ", eventId:0x" << std::hex << GetEventId() << std::dec << ")]";
        return oss.str();
    }
    if (rtNodeType == ACL_MODEL_RI_TASK_VALUE_WAIT) {
        oss << ", MemoryWait(flag:0x" << std::hex << nodeInfos.syncInfos.memoryWaitFlag
            << ", value:0x" << std::hex << nodeInfos.syncInfos.memoryValue
            << std::dec << ", eventId:0x" << std::hex << GetEventId() << std::dec << ")]";
        return oss.str();
    }

    const char* eventType = "Unknown";
    if (rtNodeType == ACL_MODEL_RI_TASK_EVENT_RECORD) {
        eventType = "EventNotify";
    } else if (rtNodeType == ACL_MODEL_RI_TASK_EVENT_WAIT) {
        eventType = "EventWait";
    } else if (rtNodeType == ACL_MODEL_RI_TASK_EVENT_RESET) {
        eventType = "EventReset";
    }

    const uint64_t eventId = GetEventId();
    const uint64_t eventFlag = nodeInfos.syncInfos.eventFlag;
    oss << ", " << eventType << "(eventId:0x" << std::hex << eventId
        << ", eventFlag:0x" << eventFlag << std::dec << ")]";
    return oss.str();
}
/**
 * @brief Initialise a default node.
 *
 * Runs the base-class initialisation and, when that succeeds, tags the node as
 * SkNodeType::NODE_DEFAULT and logs that it cannot be fused.
 *
 * @param opts Options manager forwarded to the base-class InitNode.
 * @return true when the base-class initialisation succeeded, false otherwise.
 */
bool SuperKernelDefaultNode::InitNode(const SuperKernelOptionsManager* opts) {
    const bool baseReady = SuperKernelBaseNode::InitNode(opts);
    if (baseReady) {
        nodeType = SkNodeType::NODE_DEFAULT;
        SK_LOGI("Default node %lu cannot be fused in super kernel.", nodeId);
        return true;
    }
    SK_LOGE("Failed to init default node for %s", Format().c_str());
    return false;
}
/**
 * @brief Reject invalidation of a default node.
 *
 * Logs an error describing the node and always reports failure.
 *
 * @return ACL_ERROR_FAILURE, unconditionally.
 */
aclError SuperKernelDefaultNode::InValidateNode() {
    const std::string description = Format();
    SK_LOGE("Default node %s should not be invalidated.", description.c_str());
    return ACL_ERROR_FAILURE;
}
/**
 * @brief Build a one-line description of this default node for logging.
 *
 * @return The node/stream identifiers followed by the literal type tag "Default".
 */
std::string SuperKernelDefaultNode::Format() const {
    std::ostringstream text;
    text << "[nodeId:" << nodeId;
    text << ", streamId:" << streamId;
    text << ", streamIdxInGraph:" << streamIdxInGraph;
    text << ", nodeIdxInStream:" << nodeIdxInStream;
    text << ", type: Default]";
    return text.str();
}
/**
 * @brief Convert a runtime task type to its integer value.
 *
 * @param type Task type to convert.
 * @return The result of static_cast<int>(type).
 */
static int TaskTypeToInt(aclmdlRITaskType type)
{
    const int numericValue = static_cast<int>(type);
    return numericValue;
}
/**
 * @brief Serialise the fields common to every node type into JSON.
 *
 * Writes "taskId", "streamId", "taskType" (string form) and "taskTypeInt"
 * (integer form of the same task type).
 *
 * @param node Node to serialise; may be nullptr.
 * @return The populated JSON object, or a default-constructed Json when node is nullptr.
 */
Json SuperKernelBaseNodeToJson(const SuperKernelBaseNode* node)
{
    Json result;
    if (node != nullptr) {
        const aclmdlRITaskType type = node->GetTaskParams().type;
        result["taskId"] = node->GetNodeId();
        result["streamId"] = node->GetStreamId();
        result["taskType"] = TaskTypeToString(type);
        result["taskTypeInt"] = TaskTypeToInt(type);
    }
    return result;
}
/**
 * @brief Serialise a kernel node into JSON.
 *
 * Starts from the common fields produced by SuperKernelBaseNodeToJson and adds
 * a "kernelParams" object built from KernelInfosToJson, extended with the
 * "argsSize" and "isHostArgs" values of the node's kernel task params.
 *
 * @param node Kernel node to serialise; may be nullptr.
 * @return The JSON object; for a nullptr node, whatever the base serialiser returns.
 */
Json SuperKernelKernelNodeToJson(const SuperKernelKernelNode* node)
{
    Json nodeJson = SuperKernelBaseNodeToJson(node);
    if (node != nullptr) {
        Json kernelParams = KernelInfosToJson(node->GetNodeInfos().kernelInfos);
        const auto& taskParams = node->GetTaskParams();
        kernelParams["argsSize"] = taskParams.kernelTaskParams.argsSize;
        kernelParams["isHostArgs"] = taskParams.kernelTaskParams.isHostArgs;
        nodeJson["kernelParams"] = kernelParams;
    }
    return nodeJson;
}
/**
 * @brief Serialise a memory/event node into JSON.
 *
 * Starts from the common fields produced by SuperKernelBaseNodeToJson and adds
 * one type-specific object:
 *  - notify / wait / reset nodes: "eventParams" with "eventId" and "eventFlag";
 *  - memory-write nodes: "valueWriteParams" with "devAddr" and "value";
 *  - memory-wait nodes: "valueWaitParams" with "devAddr", "value" and "flag".
 * Other node types get no extra object.
 *
 * @param node Memory node to serialise; may be nullptr.
 * @return The JSON object; for a nullptr node, whatever the base serialiser returns.
 */
Json SuperKernelMemoryNodeToJson(const SuperKernelMemoryNode* node)
{
    Json nodeJson = SuperKernelBaseNodeToJson(node);
    if (node == nullptr) {
        return nodeJson;
    }

    const auto& syncInfos = node->GetNodeInfos().syncInfos;
    const SkNodeType kind = node->GetNodeType();
    const bool isEventNode = (kind == SkNodeType::NODE_NOTIFY) ||
                             (kind == SkNodeType::NODE_WAIT) ||
                             (kind == SkNodeType::NODE_RESET);

    if (isEventNode) {
        Json eventParams;
        eventParams["eventId"] = PtrToHexString(reinterpret_cast<const void*>(syncInfos.eventId));
        eventParams["eventFlag"] = syncInfos.eventFlag;
        nodeJson["eventParams"] = eventParams;
    } else if (kind == SkNodeType::NODE_MEMORY_WRITE) {
        Json valueParams;
        valueParams["devAddr"] = PtrToHexString(syncInfos.addrValue);
        valueParams["value"] = Uint64ToHexString(syncInfos.memoryValue);
        nodeJson["valueWriteParams"] = valueParams;
    } else if (kind == SkNodeType::NODE_MEMORY_WAIT) {
        Json valueParams;
        valueParams["devAddr"] = PtrToHexString(syncInfos.addrValue);
        valueParams["value"] = Uint64ToHexString(syncInfos.memoryValue);
        valueParams["flag"] = syncInfos.memoryWaitFlag;
        nodeJson["valueWaitParams"] = valueParams;
    }
    return nodeJson;
}
/**
 * @brief Serialise a default node into JSON.
 *
 * Default nodes carry no type-specific data, so only the common fields written
 * by SuperKernelBaseNodeToJson are produced.
 *
 * @param node Default node to serialise; may be nullptr.
 * @return The JSON object returned by SuperKernelBaseNodeToJson.
 */
Json SuperKernelDefaultNodeToJson(const SuperKernelDefaultNode* node)
{
    Json nodeJson = SuperKernelBaseNodeToJson(node);
    return nodeJson;
}
// Forward declarations of the file-local dump helpers defined further down in this file.
// Each returns a count of dumped binaries that DumpKernelBinaries() adds up for its summary log.
static uint32_t DumpKernelBinariesToDir(const SuperKernelGraph& graph, const std::string& kernelBinsDir);
static uint32_t DumpSkEntryBinary(const std::string& kernelBinsDir);
/**
 * @brief Dump the kernel binaries of a graph, plus the SK entry binary, to disk.
 *
 * Does nothing (and reports success) when the file logger is disabled.
 * Otherwise the binaries are written into "<binPath>/bin_files", with "." used
 * as the base directory when binPath is empty.
 *
 * @param graph   Graph whose kernel nodes are dumped.
 * @param binPath Base output directory; may be empty.
 * @return false only when the output directory cannot be created, true otherwise.
 */
bool DumpKernelBinaries(const SuperKernelGraph& graph, const std::string& binPath) {
    if (!sk::logger::FileLogger::Instance().IsEnabled()) {
        return true;
    }
    SK_LOGI("Starting to dump kernel binaries to: %s", binPath.c_str());

    const std::string kernelBinsDir = (binPath.empty() ? std::string(".") : binPath) + "/bin_files";
    if (!CreateDirectoryRecursive(kernelBinsDir)) {
        SK_LOGE("Failed to create kernel binaries directory: %s", kernelBinsDir.c_str());
        return false;
    }

    const uint32_t dumpedFromGraph = DumpKernelBinariesToDir(graph, kernelBinsDir);
    const uint32_t dumpedEntry = DumpSkEntryBinary(kernelBinsDir);
    const uint32_t kernelCount = dumpedFromGraph + dumpedEntry;
    SK_LOGI("Successfully dumped %u kernel binaries to directory: %s", kernelCount, kernelBinsDir.c_str());
    return true;
}
* @brief Dump a single kernel binary to file
* @param kernelInfo Kernel information containing binary handle
* @param kernelBinsDir Output directory path
* @return true if dumped successfully, false otherwise
*/
bool DumpSingleKernelBinary(const KernelInfos& kernelInfo, const std::string& kernelBinsDir) {
void* binHostAddr = nullptr;
uint32_t binHostSize = 0;
int rtRet = rtGetBinBuffer(kernelInfo.binHdl, RT_BIN_HOST_ADDR, &binHostAddr, &binHostSize);
if (rtRet != 0 || binHostAddr == nullptr || binHostSize == 0) {
SK_LOGW("Failed to get bin buffer for kernel %s, rtRet=%d, addr=%p, size=%u",
kernelInfo.funcName.c_str(), rtRet, binHostAddr, binHostSize);
return false;
}
void* binDevAddr = nullptr;
size_t binDevSize = 0;
rtRet = aclrtBinaryGetDevAddress(kernelInfo.binHdl, &binDevAddr, &binDevSize);
if (rtRet != 0) {
SK_LOGW("Failed to get bin dev address for kernel %s, rtRet=%d", kernelInfo.funcName.c_str(), rtRet);
return false;
}
uint64_t codeSegmentAddr = reinterpret_cast<uint64_t>(binDevAddr);
std::string kernelName = SanitizePathComponent(kernelInfo.funcName);
std::ostringstream addrOss;
addrOss << "0x" << std::hex << std::uppercase << codeSegmentAddr;
std::string oFilePath = kernelBinsDir + "/" + kernelName + "_" + addrOss.str() + ".o";
std::ofstream outFile(oFilePath, std::ios::binary);
if (!outFile.is_open()) {
SK_LOGW("Failed to open file for writing: %s", oFilePath.c_str());
return false;
}
outFile.write(static_cast<char*>(binHostAddr), binHostSize);
outFile.close();
SK_LOGI("Dumped kernel binary: %s, size=%u, codeSegAddr=%lu", oFilePath.c_str(), binHostSize, codeSegmentAddr);
return true;
}
/**
 * @brief Dump the binary of every distinct kernel in the graph.
 *
 * Nodes are visited in the order returned by GetSortedNodeIds(). Only
 * NODE_KERNEL nodes that are not scope begin / end / placeholder nodes are
 * considered, and each non-null binary handle is dumped at most once.
 *
 * @param graph         Graph to walk.
 * @param kernelBinsDir Output directory path.
 * @return Number of binaries for which DumpSingleKernelBinary reported success.
 */
uint32_t DumpKernelBinariesToDir(const SuperKernelGraph& graph, const std::string& kernelBinsDir) {
    std::unordered_set<uint64_t> visitedHandles;
    uint32_t dumpedCount = 0;
    const std::vector<uint64_t> orderedIds = graph.GetSortedNodeIds();
    for (const uint64_t nodeId : orderedIds) {
        const SuperKernelBaseNode* node = graph.GetNodeById(nodeId);
        if (node == nullptr) {
            SK_LOGE("Failed to get node %lu from graph", nodeId);
            continue;
        }

        const bool isPlainKernel = node->GetNodeType() == SkNodeType::NODE_KERNEL &&
                                   !node->IsScopeBegin() && !node->IsScopeEnd() && !node->IsScopePlaceholder();
        if (!isPlainKernel) {
            continue;
        }

        const KernelInfos& kernelInfo = node->GetNodeInfos().kernelInfos;
        const uint64_t handleKey = reinterpret_cast<uint64_t>(kernelInfo.binHdl);
        if (handleKey == 0) {
            continue;
        }
        // insert() reports through .second whether the handle was new.
        if (!visitedHandles.insert(handleKey).second) {
            continue;
        }
        if (DumpSingleKernelBinary(kernelInfo, kernelBinsDir)) {
            ++dumpedCount;
        }
    }
    return dumpedCount;
}
/**
 * @brief Dump the super-kernel entry binary to file.
 *
 * The binary handle comes from AscendGetEntryBinHandle(). Its host copy is
 * written to "<kernelBinsDir>/sk_entry_0x<DEVICE ADDRESS IN UPPERCASE HEX>.o",
 * where the device address comes from aclrtBinaryGetDevAddress.
 *
 * @param kernelBinsDir Output directory path.
 * @return 1 when the binary was written successfully, 0 otherwise (no entry
 *         handle, buffer or device address lookup failed, the file could not
 *         be opened, or writing / closing the file failed).
 */
uint32_t DumpSkEntryBinary(const std::string& kernelBinsDir) {
    aclrtBinHandle entryBinHandle = AscendGetEntryBinHandle();
    if (entryBinHandle == nullptr) {
        SK_LOGI("SK entry bin handle is null, skip SK binary dump");
        return 0;
    }

    void* entryBinAddr = nullptr;
    uint32_t entryBinSize = 0;
    int rtRet = rtGetBinBuffer(entryBinHandle, RT_BIN_HOST_ADDR, &entryBinAddr, &entryBinSize);
    if (rtRet != 0 || entryBinAddr == nullptr || entryBinSize == 0) {
        SK_LOGW("Failed to get SK entry bin buffer, rtRet=%d", rtRet);
        return 0;
    }

    void* entryBinDevAddr = nullptr;
    size_t entryBinDevSize = 0;  // Required by the API; only the address is used here.
    rtRet = aclrtBinaryGetDevAddress(entryBinHandle, &entryBinDevAddr, &entryBinDevSize);
    if (rtRet != 0) {
        SK_LOGW("Failed to get SK entry bin dev address, rtRet=%d", rtRet);
        return 0;
    }

    uint64_t codeSegmentAddr = reinterpret_cast<uint64_t>(entryBinDevAddr);
    std::ostringstream skAddrOss;
    skAddrOss << "0x" << std::hex << std::uppercase << codeSegmentAddr;
    std::string skOFilePath = kernelBinsDir + "/sk_entry_" + skAddrOss.str() + ".o";

    std::ofstream skOutFile(skOFilePath, std::ios::binary);
    if (!skOutFile.is_open()) {
        SK_LOGW("Failed to open SK binary file for writing: %s", skOFilePath.c_str());
        return 0;
    }
    skOutFile.write(static_cast<const char*>(entryBinAddr), static_cast<std::streamsize>(entryBinSize));
    skOutFile.close();
    // A failed write sets badbit and a failed flush/close sets failbit; either one
    // means the file on disk is incomplete, so it must not be counted as dumped.
    if (skOutFile.fail()) {
        SK_LOGW("Failed to write SK entry binary to file: %s", skOFilePath.c_str());
        return 0;
    }
    SK_LOGI("Dumped SK entry binary: %s, size=%u, codeSegAddr=%lu", skOFilePath.c_str(), entryBinSize, codeSegmentAddr);
    return 1;
}