* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file device_runner.cpp
* \brief
*/
#include "machine/runtime/runner/device_runner.h"
#include <cstdlib>
#include <mutex>
#include "tilefwk/platform.h"
#include "tilefwk/pypto_fwk_log.h"
#include "tilefwk/error_code.h"
#include "adapter/api/msprof_api.h"
#include "adapter/api/acl_api.h"
#include "adapter/api/runtime_api.h"
#include "interface/utils/common.h"
#include "interface/utils/op_info_manager.h"
#include "interface/configs/config_manager.h"
#include "interface/machine/host/host_machine.h"
#include "interface/utils/file_utils.h"
#include "machine/runtime/runner/runtime_agent.h"
#include "machine/runtime/runner/runtime_utils.h"
#include "machine/runtime/context/stream_context.h"
#include "machine/runtime/runner/pmu_common.h"
#include "machine/runtime/runner/load_aicpu_op.h"
#include "machine/runtime/runner/device_error_tracking.h"
#include "machine/runtime/runner/device_exception_dump.h"
#include "machine/runtime/memory_utils/memory_pool.h"
#include "machine/host/perf_analysis.h"
#include "machine/device/dynamic/device_common.h"
#include "machine/utils/machine_ws_intf.h"
#include "securec.h"
extern "C" {
__attribute__((weak)) int AdxDataDumpServerUnInit();
__attribute__((weak)) int dlog_getlevel(int32_t moduled, int32_t* enableEvent);
__attribute__((weak)) int drvDeviceGetPhyIdByIndex(uint32_t logicDevId, uint32_t* phyDevId);
}
namespace npu::tile_fwk {
namespace {
constexpr uint32_t MIX_BLOCK_DIM = 2;
constexpr uint32_t HIGHT_BIT = 16;
constexpr uint32_t SUB_CORE = 3;
constexpr uint32_t AIV_PER_AICORE = 2;
constexpr uint32_t AICPU_NUM_OF_RUN_AICPU_TASKS = 1;
void* DevAlloc(const uint64_t size)
{
uint8_t* devPtr = nullptr;
DevMemoryPool::Instance().AllocDevAddr(&devPtr, size);
if (devPtr == nullptr) {
MACHINE_LOGE(RtErr::RT_MALLOC_FAILED, "Failed to alloc dev addr of size[%lu].", size);
return nullptr;
}
if (RuntimeMemset(devPtr, size, 0, size) != RT_SUCCESS) {
DevMemoryPool::Instance().FreeDevAddr(devPtr);
MACHINE_LOGE(RtErr::RT_MEMSET_FAILED, "RuntimeMemset failed size=%lu.", size);
return nullptr;
}
return devPtr;
}
void* CopyDataToDevice(const void *dataPtr, const uint64_t dataSize)
{
void* devAddr = DevAlloc(dataSize);
if (devAddr == nullptr) {
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Failed to alloc dev memory of size %lu", dataSize);
return nullptr;
}
if (RuntimeMemcpy(devAddr, dataSize, dataPtr, dataSize, RtMemcpyKind::HOST_TO_DEVICE) != RT_SUCCESS) {
DevMemoryPool::Instance().FreeDevAddr(devAddr);
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Failed to copy data to dev of size %lu", dataSize);
return nullptr;
}
return devAddr;
}
}
DeviceRunner::~DeviceRunner() {}
DeviceRunner& DeviceRunner::Get()
{
static DeviceRunner runner;
std::call_once(runner.once_, [&]() { ASSERT(DevCommonErr::INIT_FAILED, runner.Init() == 0); });
return runner;
}
void DeviceRunner::SetHostProfFunction(Function* function, const std::vector<npu::tile_fwk::dynamic::DeviceTensorData>& tensors)
{
hostProf_.SetProfFunction(function, tensors);
}
uint32_t DeviceRunner::GetHostProfType() const
{
return hostProf_.GetProfType();
}
void DeviceRunner::InitDevDfxArgs(const bool isPerfTrace, DevDfxArgs &devDfxArg)
{
int logLevel = -1;
if (dlog_getlevel != nullptr) {
int32_t enableLog = -1;
logLevel = dlog_getlevel(PYPTO, &enableLog);
}
devDfxArg.logLevel = logLevel;
uint32_t logicalDevId = GetLogDeviceId();
uint32_t phyDevId = 0;
if (drvDeviceGetPhyIdByIndex != nullptr) {
drvDeviceGetPhyIdByIndex(logicalDevId, &phyDevId);
} else {
MACHINE_LOGW("Get device Local deviceId failed");
}
MACHINE_LOGI("Current device info: logical devId: %u, phyDevId: %u", logicalDevId, phyDevId);
devDfxArg.deviceId = phyDevId;
if (isPerfTrace) {
devDfxArg.isOpenPerfTrace = 1;
}
MACHINE_LOGI("Dfx info: log level is: %d, openPerTrace: %d, deviceId: %u",
logLevel, devDfxArg.isOpenPerfTrace, devDfxArg.deviceId);
}
void DeviceRunner::ResetPerData() const { devicePerf_.ResetPerData(); }
void DeviceRunner::InitMetaData(DeviceArgs& devArgs) const
{
devArgs.runtimeDataRingBufferAddr = args_.runtimeDataRingBufferAddr;
devArgs.sharedBuffer = args_.sharedBuffer;
devArgs.coreRegAddr = args_.coreRegAddr;
devArgs.nrAic = args_.nrAic;
devArgs.nrAiv = args_.nrAiv;
devArgs.corePmuRegAddr = args_.corePmuRegAddr;
devArgs.corePmuAddr = args_.corePmuAddr;
devArgs.taskWastTime = args_.taskWastTime;
devArgs.pmuEventAddr = args_.pmuEventAddr;
devArgs.aicpuPerfAddr = args_.aicpuPerfAddr;
devArgs.devDfxArgAddr = args_.devDfxArgAddr;
}
int DeviceRunner::InitDeviceArgsCore(DeviceArgs& args)
{
std::vector<int64_t> regs;
std::vector<int64_t> regsPmu;
GetAicoreRegs(args.archInfo, regs, regsPmu);
args.nrAic = regs.size() / SUB_CORE;
args.nrAiv = args.nrAic * AIV_PER_AICORE;
uint64_t nrCore = regs.size() + AICPU_NUM_OF_RUN_AICPU_TASKS;
args.sharedBuffer = reinterpret_cast<uint64_t>(DevAlloc(nrCore * SHARED_BUFFER_SIZE));
args.corePmuAddr = reinterpret_cast<uint64_t>(DevAlloc(nrCore * PMU_BUFFER_SIZE));
if (args.sharedBuffer == 0 || args.corePmuAddr == 0) {
return -1;
}
size_t regSize = regs.size() * sizeof(uint64_t);
args.coreRegAddr = reinterpret_cast<uint64_t>(CopyDataToDevice(regs.data(), regSize));
if (args.coreRegAddr == 0) {
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Fail to copy aicore reg data from host to device.");
return -1;
}
args.corePmuRegAddr = reinterpret_cast<uint64_t>(CopyDataToDevice(regsPmu.data(), regSize));
if (args.corePmuRegAddr == 0) {
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Fail to copy aicore pmu reg data from host to device.");
return -1;
}
MACHINE_LOGI("Dev args :aic %u aiv %u, sharedBuffer %lx coreRegAddr %lx corePmuRegAddr %lx",
args.nrAic, args.nrAiv, args.sharedBuffer, args.coreRegAddr, args.corePmuRegAddr);
args.taskWastTime = reinterpret_cast<uint64_t>(DevAlloc(sizeof(uint64_t)));
size_t shmSize = sizeof(dynamic::RuntimeDataRingBufferHead) + dynamic::DEVICE_SHM_SIZE +
dynamic::DEVICE_TASK_QUEUE_SIZE * args.nrAicpu;
args.runtimeDataRingBufferAddr = reinterpret_cast<uint64_t>(DevAlloc(shmSize));
std::vector<int64_t> pmuEvtType;
PmuCommon::InitPmuEventType(args.archInfo, pmuEvtType);
args.pmuEventAddr =
reinterpret_cast<uint64_t>(CopyDataToDevice(pmuEvtType.data(), pmuEvtType.size() * sizeof(int64_t)));
if (args.pmuEventAddr == 0) {
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Fail to copy pmu evt type from host to device.");
return -1;
}
InitAicpuPerfAddr(args);
DevDfxArgs dfxArgs;
InitDevDfxArgs(args.aicpuPerfAddr != 0, dfxArgs);
args.devDfxArgAddr = reinterpret_cast<uint64_t>(CopyDataToDevice(&dfxArgs, sizeof(DevDfxArgs)));
if (args.devDfxArgAddr == 0) {
MACHINE_LOGE(DevCommonErr::ALLOC_FAILED, "Fail to copy dfx info from host to device.");
return -1;
}
return 0;
}
void DeviceRunner::InitAicpuPerfAddr(DeviceArgs& args)
{
if (GetEnvVar("DUMP_DEVICE_PERF") == "true") {
auto aicpuDevPtr = DevMallocWithAlignSize(MAX_ROUND_NUM * sizeof(MetricPerf), TWO_MB_HUGE_PAGE_FLAGS);
if (aicpuDevPtr == nullptr) {
MACHINE_LOGW("Aicpu per addr malloc failed");
} else {
args.aicpuPerfAddr = npu::tile_fwk::dynamic::PtrToValue(aicpuDevPtr);
}
}
}
void DeviceRunner::GetAicoreRegs(const ArchInfo archInfo, std::vector<int64_t> ®s, std::vector<int64_t> ®sPmu)
{
if (archInfo == ArchInfo::DAV_3510) {
RuntimeAgent::GetAgent()->GetAicoreRegInfoForDAV3510(regs, regsPmu);
return;
}
if (archInfo == ArchInfo::DAV_2201) {
std::vector<int64_t> aiv;
std::vector<int64_t> aic;
if (RuntimeAgent::GetAgent()->GetAicoreRegInfo(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL) != 0) {
return;
}
regs.insert(regs.end(), aic.begin(), aic.end());
regs.insert(regs.end(), aiv.begin(), aiv.end());
std::vector<int64_t> aivPmu;
std::vector<int64_t> aicPmu;
if (RuntimeAgent::GetAgent()->GetAicoreRegInfo(aicPmu, aivPmu, ADDR_MAP_TYPE_REG_AIC_PMU_CTRL) != 0) {
return;
}
regsPmu.insert(regsPmu.end(), aicPmu.begin(), aicPmu.end());
regsPmu.insert(regsPmu.end(), aivPmu.begin(), aivPmu.end());
}
}
int DeviceRunner::InitDeviceArgs(DeviceArgs& args)
{
memset_s(&args, sizeof(args), 0, sizeof(args));
args.deviceId = GetLogDeviceId();
args.archInfo = static_cast<ArchInfo>(Platform::Instance().GetSoc().GetNPUArch());
uint32_t aicpuNum = args.archInfo == ArchInfo::DAV_3510 ? dynamic::DEVICE_MAX_AICPU_NUM : 5;
args.maxAicpuNum = static_cast<uint32_t>(Platform::Instance().GetSoc().GetAICPUNum());
args.nrValidAic = GetCfgBlockdim();
args.nrAicpu = std::min(aicpuNum, args.maxAicpuNum);
args.scheCpuNum = dynamic::CalcSchAicpuNumByBlockDim(args.nrValidAic, args.nrAicpu, args.archInfo);
MACHINE_LOGD("DevArgs: block dim[%u], aicpu num[%u], max aicpu num[%u], sche cpu num[%u].",
args.nrValidAic, args.nrAicpu, args.maxAicpuNum, args.scheCpuNum);
InitAiCpuSoBin(args);
return InitDeviceArgsCore(args);
}
void DeviceRunner::SyncProfData(const bool forceSync) { devicePerf_.SyncProfData(forceSync); }
int DeviceRunner::DynamicLaunchSynchronize(RtStream schedStream, RtStream ctrlStream, RtStream aicoreStream)
{
int rcAicore = RuntimeStreamSynchronize(aicoreStream);
int rcAicpu = RuntimeStreamSynchronize(schedStream);
int rcCtrl = 0;
if (ctrlStream != nullptr) {
rcCtrl = RuntimeStreamSynchronize(ctrlStream);
}
if (IsPtoDataDumpEnabled()) {
MACHINE_LOGD("DataDumpServerInit is called \n");
AdxDataDumpServerUnInit();
}
if (rcAicore != 0 || rcAicpu != 0 || rcCtrl != 0) {
MACHINE_LOGW("sync stream failed aicpu:%d aicore:%d ctrl cpu:%d", rcAicpu, rcAicore, rcCtrl);
}
return rcAicore + rcAicpu + rcCtrl;
}
int DeviceRunner::LaunchDynamicAiCore(void *binHandle, RtStream aicoreStream, uint32_t blockDim,
DeviceKernelArgs* kernelArgs)
{
RtArgsEx rtArgs;
memset_s(&rtArgs, sizeof(rtArgs), 0, sizeof(rtArgs));
std::vector<void*> kArgs = {nullptr, nullptr, nullptr, nullptr, nullptr, kernelArgs->cfgdata};
rtArgs.args = kArgs.data();
rtArgs.argsSize = kArgs.size() * sizeof(int64_t);
uint64_t tilingKey = OpInfoManager::GetInstance().GetOpTilingKey();
RtTaskCfgInfo cfg = {};
cfg.schemMode = static_cast<uint8_t>(RtSchemModeType::BATCH);
return RuntimeKernelLaunchWithHandleV2(binHandle, tilingKey, blockDim, &rtArgs, nullptr, aicoreStream, &cfg);
}
int DeviceRunner::LaunchDynamicAiCpu(RtStream aicpuStream, uint32_t aicpuNum, DeviceKernelArgs* kernelArgs)
{
#ifdef BUILD_WITH_NEW_CANN
return LoadAicpuOp::GetInstance().LaunchBuiltInOp(aicpuStream, kernelArgs, aicpuNum, "PyptoRun");
#endif
auto args = reinterpret_cast<AiCpuArgs*>(kernelArgs->inputs);
kernelArgs->inputs = nullptr;
args->kArgs = *kernelArgs;
RtAicpuArgsEx rtArgs;
memset_s(&rtArgs, sizeof(rtArgs), 0, sizeof(rtArgs));
rtArgs.args = args;
rtArgs.argsSize = reinterpret_cast<uint64_t>(kernelArgs->outputs);
rtArgs.kernelNameAddrOffset = offsetof(AiCpuArgs, kernelName);
rtArgs.soNameAddrOffset = offsetof(AiCpuArgs, soName);
rtArgs.hostInputInfoNum = 1;
RtHostInputInfo hostInputInfo;
hostInputInfo.addrOffset = reinterpret_cast<int8_t*>(&args->kArgs.inputs) - reinterpret_cast<int8_t*>(args);
hostInputInfo.dataOffset = sizeof(AiCpuArgs);
rtArgs.hostInputInfoPtr = &hostInputInfo;
rtArgs.timeout = dynamic::AICPU_EXECUTE_TIMEOUT;
MACHINE_LOGI("Copy flow addrOffset %u argsSize %u", hostInputInfo.addrOffset, hostInputInfo.dataOffset);
return RuntimeAicpuKernelLaunchExWithArgs(static_cast<uint32_t>(RtKernelType::AICPU_KFC), "AST_DYN_AICPU", aicpuNum,
&rtArgs, nullptr, aicpuStream, RT_KERNEL_USE_SPECIAL_TIMEOUT);
}
void DeviceRunner::InitAiCpuSoBin(DeviceArgs& devArgs)
{
std::vector<char> buffer;
std::string fileName = GetCurrentSharedLibPath() + "/libtilefwk_backend_server.so";
if (!ReadBytesFromFile(fileName, buffer)) {
MACHINE_LOGE(
DevCommonErr::FILE_ERROR, "Read bin form tilefwk_backend_server.so failed, please check the so[%s]",
fileName.c_str());
return;
}
void* devBufferPtr = CopyDataToDevice(buffer.data(), buffer.size());
if (devBufferPtr == nullptr) {
MACHINE_LOGE(DevCommonErr::MEMCPY_FAILED, "Failed to copy buffer of [%s] to device.", fileName.c_str());
return;
}
devArgs.aicpuSoBin = reinterpret_cast<uint64_t>(devBufferPtr);
devArgs.aicpuSoLen = buffer.size();
HOST_PERF_TRACE(TracePhase::RunDevKernelInitAicpuSo);
}
int DeviceRunner::LaunchAicpuServerInit(int64_t *devArgsAddr)
{
auto aicpuStream = GetStreamContext().GetScheStream();
#ifdef BUILD_WITH_NEW_CANN
return LoadAicpuOp::GetInstance().LaunchBuiltInOp(aicpuStream, kArgs, 1, "PyptoInit");
#endif
struct Args {
DeviceKernelArgs kArgs;
const char kernelName[32] = {"DynTileFwkKernelServerInit"};
const char soName[32] = {"libaicpu_extend_kernels.so"};
const char opName[32] = {""};
} args;
args.kArgs.cfgdata = devArgsAddr;
RtAicpuArgsEx rtArgs;
memset_s(&rtArgs, sizeof(rtArgs), 0, sizeof(rtArgs));
rtArgs.args = &args;
rtArgs.argsSize = sizeof(args);
rtArgs.kernelNameAddrOffset = offsetof(struct Args, kernelName);
rtArgs.soNameAddrOffset = offsetof(struct Args, soName);
int ret = RuntimeAicpuKernelLaunchExWithArgs(
static_cast<uint32_t>(RtKernelType::AICPU_KFC), "AST_DYN_AICPU", 1, &rtArgs, nullptr, aicpuStream, 0);
if (ret != RT_SUCCESS) {
MACHINE_LOGE(RtErr::RT_LAUNCH_FAILED, "Aicpu server init failed %d", ret);
return ret;
}
return RuntimeStreamSynchronize(aicpuStream);
}
bool DeviceRunner::GetEnableDumpDevPref() const { return args_.aicpuPerfAddr != 0; }
void DeviceRunner::SetDebugEnable() { devicePerf_.SetDebugEnable(); }
void DeviceRunner::RunPost(RtStream aicpuStream, RtStream aicoreStream)
{
AclRtEvent event;
int rc = AclRtCreateEventExWithFlag(&event, ACL_EVENT_SYNC);
if (rc < 0) {
MACHINE_LOGI("CreateEvent failed rc=%d.", rc);
}
rc = AclRtRecordEvent(event, aicpuStream);
if (rc < 0) {
MACHINE_LOGI("RecordEvent failed rc=%d", rc);
}
rc = AclRtStreamWaitEvent(aicoreStream, event);
if (rc < 0) {
MACHINE_LOGI("StreamWaitEvent failed rc=%d", rc);
}
}
int DeviceRunner::DynamicKernelLaunch(const KernelLaunchInfo &launchInfo, DeviceKernelArgs* kernelArgs) const
{
HOST_PERF_TRACE(TracePhase::RunDevKernelLaunchAicpuInit);
uint64_t startTime = MspfSysCycleTime();
auto rc = LaunchDynamicAiCpu(launchInfo.schedStream, launchInfo.aicpuNum, kernelArgs);
if (rc < 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_AICPU_FAILED, "launch aicpu failed %d\n", rc);
return rc;
}
ReportHostProfInfo(launchInfo.aicoreStream, startTime, launchInfo.aicpuNum, MSPF_GE_TASK_TYPE_AI_CPU);
HOST_PERF_TRACE(TracePhase::RunDevKernelLaunchAicpuRun);
startTime = MspfSysCycleTime();
rc = LaunchDynamicAiCore(launchInfo.binHandle, launchInfo.aicoreStream, launchInfo.blockDim, kernelArgs);
if (rc < 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_AICPU_FAILED, "launch aicpu failed %d\n", rc);
return rc;
}
ReportHostProfInfo(launchInfo.aicoreStream, startTime, launchInfo.blockDim, MSPF_GE_TASK_TYPE_MIX_AIC, true);
HOST_PERF_TRACE(TracePhase::RunDevKernelLaunchAIcore);
return rc;
}
int DeviceRunner::DynamicTripleStreamLaunch(const KernelLaunchInfo &launchInfo, DeviceKernelArgs* kernelArgs) const
{
LoadAicpuOp::GetInstance().CustomAiCpuSoLoad();
auto args = reinterpret_cast<AiCpuArgs*>(kernelArgs->inputs);
kernelArgs->inputs = nullptr;
args->kArgs = *kernelArgs;
RtAicpuArgsEx rtArgs;
memset_s(&rtArgs, sizeof(rtArgs), 0, sizeof(rtArgs));
rtArgs.args = args;
rtArgs.argsSize = reinterpret_cast<uint64_t>(kernelArgs->outputs);
rtArgs.hostInputInfoNum = 1;
rtArgs.kernelNameAddrOffset = offsetof(AiCpuArgs, kernelName);
rtArgs.soNameAddrOffset = offsetof(AiCpuArgs, soName);
RtHostInputInfo hostInputInfo;
hostInputInfo.addrOffset = reinterpret_cast<int8_t*>(&args->kArgs.inputs) - reinterpret_cast<int8_t*>(args);
hostInputInfo.dataOffset = sizeof(AiCpuArgs);
rtArgs.hostInputInfoPtr = &hostInputInfo;
MACHINE_LOGI("Copy flow addrOffset %u argsSize %u", hostInputInfo.addrOffset, hostInputInfo.dataOffset);
args->kArgs.parameter.runMode = RUN_SPLITTED_STREAM_CTRL;
uint64_t startTime = MspfSysCycleTime();
int rc = RuntimeAicpuKernelLaunchExWithArgs(
static_cast<uint32_t>(RtKernelType::AICPU_KFC), "AST_DYN_AICPU", 1, &rtArgs, nullptr, launchInfo.ctrlStream, 0);
if (rc < 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_AICPU_FAILED, "triple stream launch ctrl aicpu failed %d\n", rc);
return rc;
}
ReportHostProfInfo(launchInfo.ctrlStream, startTime, 1, MSPF_GE_TASK_TYPE_AI_CPU, false);
startTime = MspfSysCycleTime();
args->kArgs.parameter.runMode = RUN_SPLITTED_STREAM_SCHE;
rc = RuntimeAicpuKernelLaunchExWithArgs(static_cast<uint32_t>(RtKernelType::AICPU_KFC), "AST_DYN_AICPU",
launchInfo.aicpuNum, &rtArgs, nullptr, launchInfo.schedStream, 0);
if (rc < 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_AICPU_FAILED, "triple stream launch sche aicpu failed %d\n", rc);
return rc;
}
ReportHostProfInfo(launchInfo.schedStream, startTime, launchInfo.aicpuNum, MSPF_GE_TASK_TYPE_AI_CPU, false);
startTime = MspfSysCycleTime();
rc = LaunchDynamicAiCore(launchInfo.binHandle, launchInfo.aicoreStream, launchInfo.blockDim, kernelArgs);
if (rc < 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_AICORE_FAILED, "triple stream launch aicore failed %d\n", rc);
return rc;
}
ReportHostProfInfo(launchInfo.aicoreStream, startTime, launchInfo.blockDim, MSPF_GE_TASK_TYPE_MIX_AIC, true);
RunPost(launchInfo.ctrlStream, launchInfo.aicoreStream);
return rc;
}
int DeviceRunner::DynamicLaunch(const KernelLaunchInfo &launchInfo, DeviceKernelArgs* kernelArgs)
{
#ifdef BUILD_WITH_NEW_CANN
auto ret = LoadAicpuOp::GetInstance().LaunchPyptoNullOp(launchInfo.schedStream, kernelArgs, 1);
if (ret != 0) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_BUILTIN_OP_NULL_FAILED, "launch built null failed");
return ret;
}
#endif
if (!devicePerf_.RunPrepare()) {
MACHINE_LOGE(HostLauncherErr::LAUNCH_PREPARE_FAILED, "Prepare failed.");
return -1;
}
HOST_PERF_TRACE(TracePhase::RunDevKernelInitRunPrepare);
if (launchInfo.isCaptureActivate) {
ExchangeCaptureModeGlobal();
}
if (launchInfo.ctrlStream == nullptr) {
return DynamicKernelLaunch(launchInfo, kernelArgs);
}
return DynamicTripleStreamLaunch(launchInfo, kernelArgs);
}
void DeviceRunner::ReportHostProfInfo(
RtStream stream, uint64_t startTime, uint32_t blockDim, uint16_t taskType, bool isCore) const
{
if (hostProf_.GetProfType() == MSPF_COMMANDHANDLE_TYPE_START) {
uint64_t endTime = MspfSysCycleTime();
if (isCore) {
uint32_t mixBlockDim = MIX_BLOCK_DIM;
blockDim = (mixBlockDim << HIGHT_BIT) | blockDim;
hostProf_.HostProfReportContextInfo(endTime);
}
if ((hostProf_.GetProfSwitch() & MSPF_TASK_TIME_L1_MASK) != 0) {
hostProf_.HostProfReportNodeInfo(endTime, blockDim, taskType);
}
endTime = MspfSysCycleTime();
(void)hostProf_.HostProfReportApi(startTime, endTime);
}
if (taskType == MSPF_GE_TASK_TYPE_MIX_AIC) {
hostProf_.HostProfReportCacheTaskInfo(stream, blockDim, taskType);
}
}
int DeviceRunner::DynamicRun(const KernelLaunchInfo &launchInfo, DeviceKernelArgs* kernelArgs)
{
int rc = DynamicLaunch(launchInfo, kernelArgs);
if (rc < 0) {
return rc;
}
if (launchInfo.isCaptureActivate) {
return 0;
}
return DynamicLaunchSynchronize(launchInfo.schedStream, launchInfo.ctrlStream, launchInfo.aicoreStream);
}
int DeviceRunner::Init()
{
std::string builtInOpPath = config::LogTopFolder() + "/built_in";
CreateMultiLevelDir(builtInOpPath);
LoadAicpuOp::GetInstance().GenBuiltInOpInfo(builtInOpPath);
if (LoadAicpuOp::GetInstance().GetBuiltInOpBinHandle() != 0) {
MACHINE_LOGE(DevCommonErr::GET_HANDLE_FAILED, "Get builtInOp Funchandle failed\n");
return -1;
}
hostProf_.RegHostProf();
InitializeErrorCallback();
if (InitDeviceArgs(args_) != 0) {
MACHINE_LOGE(HostLauncherErr::PREPARE_ARGS_FAILED, "prepareArgs failed\n");
return -1;
}
int64_t *devArgsAddr = static_cast<int64_t*>(CopyDataToDevice(&args_, sizeof(DeviceArgs)));
if (devArgsAddr == nullptr) {
MACHINE_LOGE(DevCommonErr::MEMCPY_FAILED, "Failed to copy args to device.");
return -1;
}
if (config::GetRuntimeOption<int64_t>(CFG_RUN_MODE) != CFG_RUN_MODE_SIM) {
LaunchAicpuServerInit(devArgsAddr);
}
devicePerf_.InitAndStartDumpThread(args_);
npu::tile_fwk::dynamic::AdumpRegExceptionDump();
return 0;
}
}