/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
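/*!
* \file device_ctrl.h
* \brief Declares DeviceCtrlMachine, which initialises the dynamic device program for a launch and pushes the
*        device tasks produced by the control flow onto the scheduler task queues.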
*/
#pragma once
#include "device_common.h"
#include <cstdint>
#include <cstdlib>
#include "device_utils.h"
#include "device_perf.h"
#include "machine/device/dynamic/context/device_execute_context.h"
#include "machine/utils/dynamic/dev_tensor_creator.h"
#include "machine/utils/dynamic/dev_encode_tensor.h"
#include "machine/utils/machine_ws_intf.h"
#include "machine/utils/device_log.h"
#include "machine/utils/barrier.h"
#include "machine/device/dynamic/perf_event_sampler.h"
#include "machine/device/dynamic/aicore_prof.h"
#include "device_trace.h"
#ifdef __DEVICE__
#include "log_types.h"
#endif
#ifdef __USE_CUSTOM_CTRLFLOW__
extern "C" __attribute__((visibility("default"))) void* GetCtrlFlowFunc();
#endif
extern "C" __attribute__((weak)) int dlog_setlevel(int32_t moduled, int32_t level, int32_t enableEvent);
namespace npu::tile_fwk::dynamic {
class DeviceCtrlMachine {
public:
/**
 * \brief Binds a device task to the pooled task-control slot at \p idx.
 *
 * \param idx     Slot index in the task-control pool; must lie in [0, MAX_DEVICE_TASK_NUM).
 * \param devTask Task to bind. Its aicoreModel is filled from \p ctx, and its costModelData too when the
 *                context carries one.
 * \param ctx     Execute context supplying the model pointers.
 * \return The bound slot, or nullptr (after logging) when \p ctx is null or \p idx is out of range.
 */
DeviceTaskCtrl* InitTaskCtrl(int idx, DeviceTask* devTask, DeviceExecuteContext* ctx)
{
    // The context is validated first so that the more specific error code is reported.
    if (ctx == nullptr) {
        DEV_ERROR(
            CtrlErr::ROOT_ALLOC_CTX_NULL, "#ctrl.push.init_dtask: Init Task control failed, which ctx is null.");
        return nullptr;
    }

    const bool idxInPool = (idx >= 0) && (idx < MAX_DEVICE_TASK_NUM);
    if (!idxInPool) {
        DEV_ERROR(
            CtrlErr::CTRL_FLOW_EXEC_FAILED, "#ctrl.push.init_dtask: Init Task control failed, idx=%d out of bounds.",
            idx);
        return nullptr;
    }

    DeviceTaskCtrl* slot = &GetTaskCtrlInPool(idx);
    slot->BindTask(devTask);
    slot->SetSchNumCnt(GetScheAicpuNum());

    // Model handles are stored on the task as integer addresses.
    devTask->aicoreModel = reinterpret_cast<uint64_t>(ctx->aicoreModel);
    if (ctx->costModelData != nullptr) {
        devTask->costModelData = reinterpret_cast<uint64_t>(ctx->costModelData);
    }
    return slot;
}
/**
 * \brief Finds a free slot in the task-control pool, scanning round-robin from the saved cursor.
 *
 * The cursor is devStartArgs_->devCtrlState.taskCtrlIndex. It is advanced past every inspected slot and
 * wraps to 0 when it equals MAX_DEVICE_TASK_NUM.
 *
 * \return Index of the free slot. The timeout macro (initialised with TIMEOUT_1MIN) is given
 *         `return DEVICE_MACHINE_ERROR` as its action, so that value is returned when the check fires.
 */
int AllocNewTaskCtrl()
{
    uint32_t& cursor = devStartArgs_->devCtrlState.taskCtrlIndex;
    TIMEOUT_CHECK_INIT(devStartArgs_->devProg->devArgs.archInfo, TIMEOUT_1MIN);
    for (;;) {
        if (cursor == MAX_DEVICE_TASK_NUM) {
            cursor = 0;
        }
        const bool slotBusy = GetTaskCtrlInPool(cursor).IsNotFree();
        // The cursor always moves on, whether or not the inspected slot is taken.
        const uint32_t inspected = cursor++;
        if (!slotBusy) {
            return inspected;
        }
        __PYPTO_TIMEOUT_CHECK(CtrlErr::CTRL_ALLOC_TIMEOUT,
            return DEVICE_MACHINE_ERROR,
            "#ctrl.alloc: AllocNewTaskCtrl, taskCtrlIndex=%u.",
            cursor);
    }
}
/**
 * \brief Tells whether two task controls belong to the same iteration of the same parallel-for.
 *
 * \return true only when both controls support parallel execution and their ParallelForId() and
 *         ParallelIterId() values match.
 */
bool SameParallelIterTaskCtrl(DeviceTaskCtrl* srcTaskCtrl, DeviceTaskCtrl* dstTaskCtrl)
{
    // Short-circuit order mirrors the checks one by one: capability first, then the two ids.
    return srcTaskCtrl->SupportParallel() && dstTaskCtrl->SupportParallel() &&
           srcTaskCtrl->ParallelForId() == dstTaskCtrl->ParallelForId() &&
           srcTaskCtrl->ParallelIterId() == dstTaskCtrl->ParallelIterId();
}
/**
 * \brief Allocates a task-control slot for \p dynTask and publishes it to the scheduler queues.
 *
 * When the new task belongs to the same parallel iteration as the previously pushed one, it is chained
 * behind that task (nextSameIterTaskCtrl) instead of being enqueued. Nothing is enqueued either while the
 * control-flow cache anchor is recording.
 *
 * \param dynTask Task produced by the control flow.
 * \param ctx     Execute context of the running control flow.
 * \return The pool index used for the task, or a negative error code when no slot could be allocated or
 *         the slot could not be initialised.
 */
int PushTask(DynDeviceTask* dynTask, DeviceExecuteContext* ctx)
{
    auto idx = AllocNewTaskCtrl();
    if (idx < 0) {
        DEV_ERROR(CtrlErr::CTRL_ALLOC_TIMEOUT, "#ctrl.push.alloc: AllocNewTaskCtrl failed, idx=%d.", idx);
        return idx;
    }

    DeviceTaskCtrl* newTaskCtrl = InitTaskCtrl(idx, &dynTask->devTask, ctx);
    // InitTaskCtrl returns nullptr for a null ctx or an out-of-range idx; everything below dereferences
    // both newTaskCtrl and ctx, so stop here instead of crashing.
    if (newTaskCtrl == nullptr) {
        DEV_ERROR(CtrlErr::CTRL_FLOW_EXEC_FAILED, "#ctrl.push.init_dtask: InitTaskCtrl failed, idx=%d.", idx);
        return DEVICE_MACHINE_ERROR;
    }

    bool appendLastTaskCtrl = false;
    if (lastTaskCtrl_ && lastTaskCtrl_->SupportParallel()) {
        if (SameParallelIterTaskCtrl(lastTaskCtrl_, newTaskCtrl)) {
            // Chain the new task behind the previous one of the same iteration.
            lastTaskCtrl_->existNextSameIterTask = true;
            lastTaskCtrl_->nextSameIterTaskCtrl = reinterpret_cast<uint64_t>(newTaskCtrl);
            appendLastTaskCtrl = true;
        } else {
            lastTaskCtrl_->existNextSameIterTask = false;
        }
    }

    lastTaskCtrl_ = newTaskCtrl;
    if (dynTask->IsParallelSameIterLastDevTask()) {
        lastTaskCtrl_->existNextSameIterTask = false;
    }

    // A chained task is reached through its predecessor, so only unchained tasks go to the queues.
    if (!appendLastTaskCtrl && !ctx->devProg->ctrlFlowCacheAnchor->IsRecording()) {
        for (uint32_t i = 0; i < GetScheAicpuNum(); ++i) {
            GetTaskQueue(i).Enqueue(newTaskCtrl);
        }
    }
    return idx;
}
/**
 * \brief Enqueues a nullptr entry on every scheduler task queue.
 *
 * NOTE(review): the nullptr entry presumably acts as a stop marker for the queue consumers; confirm
 * against the scheduler side.
 */
void StopAicoreManager()
{
    const uint32_t queueCount = GetScheAicpuNum();
    for (uint32_t queueIdx = 0; queueIdx < queueCount; ++queueIdx) {
        GetTaskQueue(queueIdx).Enqueue(nullptr);
    }
}
/**
 * \brief Installs the callback that ExecDyn invokes for every device task before it is pushed.
 *
 * \param inspectorEntry Callback; invoked as inspectorEntry(inspector, executeContext, dynTask).
 * \param inspector      Opaque pointer handed back to the callback as its first argument.
 */
void RegisterTaskInspector(DeviceTaskInspectorEntry inspectorEntry, void* inspector)
{
    inspector_ = inspector;
    inspectorEntry_ = inspectorEntry;
}
/**
 * \brief Calls ResetEmpty() on one task queue per scheduler CPU.
 *
 * \param devProg Device program; the queue count is read from devArgs.scheCpuNum.
 */
void InitTaskPipeWithSched(DevAscendProgram* devProg)
{
    const uint32_t queueCount = devProg->devArgs.scheCpuNum;
    for (uint32_t queueIdx = 0; queueIdx < queueCount; ++queueIdx) {
        GetTaskQueue(queueIdx).ResetEmpty();
    }
}
/**
 * \brief Selects the control-flow cache used for this launch, anchors it on the program and relocates it.
 *
 * Selection rules:
 *  - the cache embedded in \p devProg is used when it is recording, or when \p ctrlFlowCache is null;
 *  - otherwise the independent \p ctrlFlowCache is used, and its metadata is relocated once when it is
 *    activated.
 *
 * When the selected cache has no device task, the function returns early (calling ResetRerun() unless this
 * is the first init). When IsActivatedPartialCache() holds, task/runtime addresses are relocated once, the
 * incast/outcast addresses are restored and relocated, the workspace addresses are re-based if the
 * workspace moved, and the program is reset for rerun.
 *
 * \param devProg       Device program that owns the embedded cache and receives the anchor.
 * \param ctrlFlowCache Optional independent cache for this launch (may be nullptr).
 * \param devStartArgs  Start arguments of this launch; supplies the context workspace address.
 * \param firstInit     True when the program was relocated during this launch.
 */
void InitCtrlFlowCache(
    DevAscendProgram* devProg, DevControlFlowCache* ctrlFlowCache, DevStartArgs* devStartArgs, bool firstInit)
{
    // Default to the cache embedded in the device program.
    DevControlFlowCache* devCtrlFlowCache = &devProg->controlFlowCache;
    if (devProg->controlFlowCache.isRecording) {
        DEV_INFO("Init dev program cache");
        devProg->controlFlowCache.contextWorkspaceAddr = devStartArgs->contextWorkspaceAddr;
    } else if (ctrlFlowCache != nullptr) {
        DEV_INFO("Init independent anchor program cache %p.", ctrlFlowCache);
        if (ctrlFlowCache->isRecording) {
            DEV_ASSERT_MSG(
                CtrlErr::CTRL_FLOW_EXEC_FAILED, !devProg->controlFlowCache.isRecording,
                "#ctrl.flow.exec: dev program ctr cache should not record");
            ctrlFlowCache->contextWorkspaceAddr = devStartArgs->contextWorkspaceAddr;
        } else {
            // NOTE(review): the condition also requires the independent cache to be activated, which is
            // stricter than the message suggests; confirm this is intended.
            DEV_ASSERT_MSG(
                CtrlErr::CTRL_FLOW_EXEC_FAILED,
                !devProg->controlFlowCache.isActivated && ctrlFlowCache->isActivated,
                "#ctrl.flow.exec: should not active dev program cache and independent ctrl cache at same time");
        }
        devCtrlFlowCache = ctrlFlowCache;
        // Metadata relocation is done at most once per cache (guarded by isRelocMetaDev).
        if (devCtrlFlowCache->isActivated && !devCtrlFlowCache->isRelocMetaDev) {
            DEV_INFO("ControlFlowCache: reloc meta cache");
            devCtrlFlowCache->isRelocMetaDev = true;
            devCtrlFlowCache->RelocMetaCache(0, reinterpret_cast<uint64_t>(devCtrlFlowCache));
        }
    }
    DEV_INFO(
        "ControlFlowCache: deviceTaskCount=%d, firstInit=%d.", static_cast<int>(devCtrlFlowCache->deviceTaskCount),
        static_cast<int>(firstInit));
    /* The selected cache is currently anchored on the device program.
     * However, it should be moved into the execute context. */
    devProg->ctrlFlowCacheAnchor = devCtrlFlowCache;
    if (devCtrlFlowCache->deviceTaskCount == 0) {
        if (!firstInit) {
            devProg->ResetRerun();
        }
        DEV_INFO("ControlFlowCache: cache have no devtask , ignore it");
        return;
    }
    if (devCtrlFlowCache->IsActivatedPartialCache(devStartArgs)) {
        DEV_INFO("ControlFlowCache: 1");
        // Program/cache address relocation is done at most once per cache (guarded by isRelocDataDev).
        if (!devCtrlFlowCache->isRelocDataDev) {
            devCtrlFlowCache->isRelocDataDev = true;
            devCtrlFlowCache->TaskAddrRelocProgramAndCtrlCache(
                0, 0, reinterpret_cast<uint64_t>(devProg), reinterpret_cast<uint64_t>(devCtrlFlowCache));
            devCtrlFlowCache->RuntimeAddrRelocProgram(0, reinterpret_cast<uint64_t>(devProg));
        }
        // Incast/outcast addresses are restored and re-based on every launch.
        devCtrlFlowCache->IncastOutcastAddrRestore();
        devCtrlFlowCache->IncastOutcastAddrReloc(0, devStartArgs->contextWorkspaceAddr, devStartArgs);
        // Workspace-based task addresses only need re-basing when the workspace address changed.
        if (devCtrlFlowCache->workspaceAddr != devStartArgs->contextWorkspaceAddr) {
            devCtrlFlowCache->workspaceAddr = devStartArgs->contextWorkspaceAddr;
            devCtrlFlowCache->TaskAddrRestoreWorkspace();
            devCtrlFlowCache->TaskAddrRelocWorkspace(0, devStartArgs->contextWorkspaceAddr, devStartArgs);
        }
        devProg->ResetRerun();
    }
}
/**
 * \brief Relocates the device program on first use and refreshes its control-flow entry point.
 *
 * A null controlFlowBinaryAddr is taken as "not initialised yet": the program is relocated and the
 * runtime-data ring buffer head is initialised. The control-flow entry address is (re)assigned on every
 * call.
 *
 * \param devProg Device program to prepare.
 * \return true when the first-time relocation was performed by this call.
 */
bool InitDevProgram(DevAscendProgram* devProg)
{
    const bool firstInit = (devProg->controlFlowBinaryAddr == nullptr);
    if (firstInit) {
        devProg->RelocProgram(0, reinterpret_cast<uint64_t>(devProg), true);
        auto* ringHead = reinterpret_cast<RuntimeDataRingBufferHead*>(devProg->GetRuntimeDataList());
        ringHead->Initialize(
            devProg->GetDeviceRuntimeOffset().size, devProg->GetDeviceRuntimeOffset().count,
            devProg->devArgs.archInfo);
        devProg->runtimeDataRingBufferInited = true;
    }
    memBarrier();
#ifdef __USE_CUSTOM_CTRLFLOW__
    DEV_INFO("Use built in ctrl flow func.");
    devProg->controlFlowBinaryAddr = GetCtrlFlowFunc();
#else
    auto execProg = DeviceExecuteProgram(devProg, nullptr);
    devProg->controlFlowBinaryAddr = execProg.GetControlFlowEntry();
#endif
    return firstInit;
}
/**
 * \brief Prepares the device program and a fresh DevStartArgs slot for one launch.
 *
 * Steps, in order: apply launch-time memory-budget overrides, initialise the device program, reset the
 * dynamic cell-match table, reserve a runtime-data slot from the ring buffer, fill it from \p kargs,
 * select and initialise the control-flow cache, then submit the slot.
 *
 * \param kargs Kernel launch arguments. inputs and cfgdata are dereferenced without a null check here
 *              (EntryInit validates them before calling).
 * \return Always 0.
 */
int InitDyn(DeviceKernelArgs* kargs)
{
DEV_INFO("AscendCppDyInitTask begin");
DevAscendProgram* devProg = PtrToPtr<int64_t, DevAscendProgram>(kargs->cfgdata);
ApplyDynamicCellMatchDescPatchesFromLaunchArgs(devProg, kargs->inputs);
auto &tensorBudget = devProg->memBudget.tensor;
auto &metadataBudget = devProg->memBudget.metadata;
// A zero value in kargs means "no override": the program's own budget is kept.
if (kargs->maxDynamicAssembleOutcastMem != 0) {
tensorBudget.maxDynamicAssembleOutcastMem = kargs->maxDynamicAssembleOutcastMem;
}
if (kargs->maxDynamicCellMatchTableMem != 0) {
metadataBudget.maxDynamicCellMatchTableMem = kargs->maxDynamicCellMatchTableMem;
// The total cell-match budget is recomputed as slot count times the overridden per-table size.
uint64_t totalDynamicCellMatchSlotNum = metadataBudget.dynamicCellMatchSlotNum;
metadataBudget.dynamicCellMatch = totalDynamicCellMatchSlotNum * metadataBudget.maxDynamicCellMatchTableMem;
}
PerfBegin(PERF_EVT_INIT);
bool firstInit = InitDevProgram(devProg);
const uint64_t dynCmCap = devProg->devArgs.dynamicCellMatchCapacity;
const uint64_t dynCmAddrU64 = devProg->devArgs.dynamicCellMatchAddr;
// When a table is configured, every 64-bit word of it is set to AICORE_TASK_INIT.
if (dynCmAddrU64 != 0 && dynCmCap != 0) {
DEV_ASSERT_MSG(DevCommonErr::PARAM_INVALID, (dynCmCap % sizeof(uint64_t)) == 0,
"#ctrl.initdyn: dynamicCellMatch cap not uint64 aligned, cap=%lu", dynCmCap);
const size_t numWords = static_cast<size_t>(dynCmCap / sizeof(uint64_t));
auto* table = reinterpret_cast<uint64_t*>(dynCmAddrU64);
for (size_t i = 0; i < numWords; ++i) {
table[i] = AICORE_TASK_INIT;
}
}
PerfEnd(PERF_EVT_INIT);
RuntimeDataRingBufferHead* ringBufferHead = devProg->GetRuntimeDataList();
DEV_INFO(
"AllocatePrepare begin runtimedata: %lu, %lu %lu", ringBufferHead->GetIndexFinished(),
ringBufferHead->GetIndexPending(), ringBufferHead->GetRuntimeDataCount());
// Reserve the runtime-data slot; it is filled below and published by AllocateSubmit() at the end.
// NOTE(review): the result of AllocatePrepare() is used without a null check - confirm it cannot fail.
DevStartArgs* devStartArgs = reinterpret_cast<DevStartArgs*>(ringBufferHead->AllocatePrepare());
DEV_INFO("AllocatePrepare end");
devStartArgs->syncFlag = 0;
devStartArgs->InitProgram(devProg, reinterpret_cast<uint64_t>(devStartArgs));
devStartArgs->devCtrlState.schAicpuNum = devProg->devArgs.scheCpuNum;
devStartArgs->devCtrlState.taskCtrlIndex = 0;
devStartArgs->devScheState.threadIdx = CTRL_THREAD_INDEX;
devStartArgs->devScheState.finished = 0;
// devStartArgs_ must be set before InitTaskPipeWithSched(): the queue accessors read through it.
devStartArgs_ = devStartArgs;
InitTaskPipeWithSched(devProg);
devStartArgs->controlFlowEntry = devProg->controlFlowBinaryAddr;
// kargs->inputs is read as: [0] = input count, [1] = output count, tensor list at TENSOR_INFO_OFFSET.
uint64_t inputSize = *kargs->inputs;
uint64_t outputSize = *(kargs->inputs + 1);
auto inputPtr = PtrToPtr<int64_t, DevTensorData>(kargs->inputs + TENSOR_INFO_OFFSET);
DEV_INFO("inputSize=%lu, outputSize=%lu, tensorListPtr=%p.", inputSize, outputSize, inputPtr);
devStartArgs->devTensorList = inputPtr;
devStartArgs->inputTensorSize = static_cast<uint64_t>(inputSize);
devStartArgs->outputTensorSize = static_cast<uint64_t>(outputSize);
devStartArgs->contextWorkspaceAddr = PtrToValue(kargs->workspace);
devStartArgs->contextWorkspaceSize = devProg->workspaceSize;
devStartArgs->inputSymbolList = nullptr;
devStartArgs->inputSymbolSize = 0;
// kargs->commContexts, when present, is read as: [0] = group count, followed by the contexts.
devStartArgs->commGroupNum = (kargs->commContexts == nullptr) ? 0 : static_cast<uint64_t>(*kargs->commContexts);
devStartArgs->commContexts = (devStartArgs->commGroupNum == 0) ? nullptr : kargs->commContexts + 1;
DevControlFlowCache* ctrlFlowCacheBase = reinterpret_cast<DevControlFlowCache*>(kargs->ctrlFlowCache);
DevControlFlowCache* ctrlFlowCache;
// A null or recording cache is used as passed in; otherwise the cache for this launch is found at
// base + usedCacheSize * GetIndexPendingIndex().
if (ctrlFlowCacheBase == nullptr) {
ctrlFlowCache = ctrlFlowCacheBase;
} else if (ctrlFlowCacheBase->IsRecording()) {
ctrlFlowCache = ctrlFlowCacheBase;
} else {
ctrlFlowCache = reinterpret_cast<DevControlFlowCache*>(
reinterpret_cast<uint8_t*>(kargs->ctrlFlowCache) +
ctrlFlowCacheBase->usedCacheSize * ringBufferHead->GetIndexPendingIndex());
}
InitCtrlFlowCache(devProg, ctrlFlowCache, devStartArgs, firstInit);
ringBufferHead->AllocateSubmit();
// Start the launch with no previous task to chain onto.
lastTaskCtrl_ = nullptr;
DEV_INFO("AscendCppDyInitTask done.");
return 0;
}
/**
 * \brief Runs the control flow for the pending launch and pushes every produced device task.
 *
 * For each task handed over by GELaunch the optional inspector is called, the task is dumped when debug
 * logging is on, and the task is pushed to the scheduler queues. After the control flow finishes, the
 * scheduler queues receive their stop entries unless the control-flow cache anchor is recording.
 *
 * \param args Kernel launch arguments; cfgdata must point at the device program.
 * \return DEVICE_MACHINE_OK on success; the GELaunch error code when the control flow fails; otherwise the
 *         first negative code returned by PushTask when a task could not be pushed.
 */
int ExecDyn(npu::tile_fwk::DeviceKernelArgs* args)
{
    DEV_INFO("start control flow.");
    auto devProg = PtrToPtr<int64_t, DevAscendProgram>(args->cfgdata);
    auto devStartArgs = (DevStartArgs*)devProg->GetRuntimeDataList()->GetRuntimeDataPending();
    DeviceExecuteContext ctx(devStartArgs);
    ctx.costModelData = reinterpret_cast<CostModel::ModelData*>(args->costmodeldata);
    ctx.aicoreModel = args->aicoreModel;
    PerfBegin(PERF_EVT_EXEC_DYN);
    PerfBegin(PERF_EVT_CONTROL_FLOW_CALL);
    // First PushTask failure seen by the callback; the callback itself has no way to report it.
    int pushRet = DEVICE_MACHINE_OK;
    int ret = ctx.GELaunch(devStartArgs, [this, &pushRet](DynDeviceTask* dynTask, DeviceExecuteContext* exeCtx) {
        if (unlikely(inspectorEntry_ != nullptr)) {
            inspectorEntry_(inspector_, exeCtx, dynTask);
        }
        DEV_IF_DEBUG { DumpTask(dynTask->GetIndex(), (DeviceTask*)dynTask, true); }
        const int rc = PushTask(dynTask, exeCtx);
        if (rc < 0 && pushRet == DEVICE_MACHINE_OK) {
            pushRet = rc;
        }
    });
    PerfEnd(PERF_EVT_CONTROL_FLOW_CALL);
    if (ret != DEVICE_MACHINE_OK) {
        // Close the outer perf event too, so begin/end stay balanced on the failure path.
        PerfEnd(PERF_EVT_EXEC_DYN);
        DeviceTrace::GetInstance().ReportTraceMsg();
        return ret;
    }
    DEV_INFO("end control flow.");
    PerfBegin(PERF_EVT_STAGE_STOP_AICORE);
    if (!devProg->ctrlFlowCacheAnchor->IsRecording()) {
        StopAicoreManager();
    }
    PerfEnd(PERF_EVT_STAGE_STOP_AICORE);
    DEV_INFO("aicore manager stopped");
    PerfEnd(PERF_EVT_EXEC_DYN);
#if ENABLE_PERF_EVT
    ctx.ShowStats();
    PerfEvtMgr::Instance().Dump();
    PerfettoMgr::Instance().Dump("/tmp/perfetto.txt");
#endif
    // The shutdown sequence above still runs after a push failure; only the reported status changes,
    // so a dropped task is no longer reported as success.
    if (pushRet != DEVICE_MACHINE_OK) {
        DEV_ERROR(
            CtrlErr::CTRL_FLOW_EXEC_FAILED, "#ctrl.flow.exec: PushTask failed during control flow, ret=%d.",
            pushRet);
        return pushRet;
    }
    return ret;
}
/**
 * \brief Applies the log level carried in the launch's DFX arguments (device builds only).
 *
 * Does nothing when \p kargs or its cfgdata is null, when no DFX argument block is attached, when the
 * requested level is -1, or when the weak symbol dlog_setlevel is not linked in.
 *
 * \param kargs Kernel launch arguments; cfgdata is read as DeviceArgs.
 */
void SetModuleLogLevel([[maybe_unused]] DeviceKernelArgs* kargs)
{
#ifdef __DEVICE__
    // This may be called before the caller has validated kargs, so guard the dereferences here.
    if (kargs == nullptr || kargs->cfgdata == nullptr) {
        return;
    }
    DeviceArgs* devArgs = reinterpret_cast<DeviceArgs*>(kargs->cfgdata);
    if (devArgs->devDfxArgAddr != 0) {
        DevDfxArgs* devDfxArgs = reinterpret_cast<DevDfxArgs*>(devArgs->devDfxArgAddr);
        // dlog_setlevel is declared weak, so its address is null when the symbol is absent.
        if (devDfxArgs->logLevel != -1 && dlog_setlevel != nullptr) {
            (void)dlog_setlevel(LOG_MOD_ID, devDfxArgs->logLevel, 1);
        }
    }
#endif
}
/**
 * \brief Init entry point: validates the launch arguments and prepares the launch via InitDyn().
 *
 * \param kargs Kernel launch arguments.
 * \return 0 on success; -1 when \p kargs, kargs->inputs or kargs->cfgdata is null, or when InitDyn()
 *         reports a non-zero status.
 */
int EntryInit(DeviceKernelArgs* kargs)
{
    // Validate before anything dereferences kargs: SetModuleLogLevel reads kargs->cfgdata on device builds.
    if (kargs == nullptr) {
        DEV_ERROR(DevCommonErr::NULLPTR, "#ctrl.init: kernel args is null.");
        return -1;
    }
    // The log level lives inside cfgdata, so it can only be applied when cfgdata is present. It is applied
    // before the remaining checks so that their log output already uses the requested level.
    if (kargs->cfgdata != nullptr) {
        SetModuleLogLevel(kargs);
    }
    if (kargs->inputs == nullptr || kargs->cfgdata == nullptr) {
        DEV_ERROR(
            DevCommonErr::NULLPTR, "#ctrl.init: Args has null in inputs[%p] work[%p] or cfg[%p].\n", kargs->inputs,
            kargs->workspace, kargs->cfgdata);
        return -1;
    }
    // The perf event is opened only once validation has passed, so every begin has a matching end.
    PerfBegin(PERF_EVT_DEVICE_MACHINE_INIT_DYN);
    const int initRet = InitDyn(kargs);
    PerfEnd(PERF_EVT_DEVICE_MACHINE_INIT_DYN);
    return (initRet == 0) ? 0 : -1;
}
/**
 * \brief Main entry point: runs ExecDyn() and maps its status onto 0 / -1.
 *
 * \param kargs Kernel launch arguments, forwarded to ExecDyn().
 * \return 0 when ExecDyn() returns DEVICE_MACHINE_OK, -1 otherwise.
 */
int EntryMain(DeviceKernelArgs* kargs)
{
    const bool succeeded = (ExecDyn(kargs) == npu::tile_fwk::dynamic::DEVICE_MACHINE_OK);
    return succeeded ? 0 : -1;
}
private:
/**
 * \brief Debug-logs the ready queues of a device task and its per-function details.
 *
 * \param devTask Task to dump; must not be null (DumpTask checks this before calling).
 * \param isDyn   When true, \p devTask is reinterpreted as a DynDeviceTask and its stitched function list
 *                is dumped; otherwise the core-function table and ready-state table are dumped.
 */
static void DumpTaskDetail(DeviceTask* devTask, bool isDyn)
{
    // The three ready queues are stored on the task as integer addresses.
    DEV_DEBUG("===== ready aic func =====");
    auto* readyQueue = reinterpret_cast<ReadyCoreFunctionQueue*>(devTask->readyAicCoreFunctionQue);
    DEV_DEBUG("aic [%s]", readyQueue->Dump().c_str());

    DEV_DEBUG("===== ready aiv func =====");
    readyQueue = reinterpret_cast<ReadyCoreFunctionQueue*>(devTask->readyAivCoreFunctionQue);
    DEV_DEBUG("aiv [%s]", readyQueue->Dump().c_str());

    DEV_DEBUG("===== ready aicpu func =====");
    readyQueue = reinterpret_cast<ReadyCoreFunctionQueue*>(devTask->readyAicpuFunctionQue);
    DEV_DEBUG("aicpu [%s]", readyQueue->Dump().c_str());

    if (!isDyn) {
        auto* coreFuncTable = reinterpret_cast<CoreFunctionWsAddr*>(devTask->coreFuncData.coreFunctionWsAddr);
        DEV_DEBUG("===== core func =====");
        for (uint64_t taskIdx = 0; taskIdx < devTask->coreFunctionCnt; ++taskIdx) {
            DEV_DEBUG(
                "taskId[%lu]: binAddr=%#lx, invokeEntry=%#lx, topo=%#lx.", taskIdx,
                coreFuncTable[taskIdx].functionBinAddr, coreFuncTable[taskIdx].invokeEntryAddr,
                coreFuncTable[taskIdx].topoAddr);
            auto* topoInfo = reinterpret_cast<CoreFunctionTopo*>(coreFuncTable[taskIdx].topoAddr);
            DEV_DEBUG(
                "  topo: coreType=%lu, psgId=%lu, readyCount=%ld, depNum=%lu.", topoInfo->coreType,
                topoInfo->psgId, topoInfo->readyCount, topoInfo->depNum);
            (void)topoInfo; // keeps the variable "used" if the debug macro drops its arguments
        }
        DEV_DEBUG("===== ready state =====");
        auto* readyStates = reinterpret_cast<CoreFunctionReadyState*>(devTask->coreFunctionReadyStateAddr);
        for (uint64_t taskIdx = 0; taskIdx < devTask->coreFunctionCnt; ++taskIdx) {
            DEV_DEBUG(
                "taskId[%lu]: readyCount=%ld, coreType=%lu.", taskIdx, readyStates[taskIdx].readyCount,
                readyStates[taskIdx].coreType);
        }
        (void)(readyStates);
    } else {
        DEV_DEBUG("===== dyn info =====");
        auto dynTask = PtrToPtr<DeviceTask, DynDeviceTask>(devTask);
        int stitchedIdx = 0;
        for (auto& stitched : dynTask->stitchedList) {
            DEV_DEBUG(
                "funcIdx=%d, %s.", stitchedIdx, stitched.DumpDyn(stitchedIdx, dynTask->cceBinary).c_str());
            ++stitchedIdx;
            (void)stitched;
        }
    }
    DEV_DEBUG("===== dev task end =====");
}
/**
 * \brief Debug-logs the header fields of a device task, then its details via DumpTaskDetail().
 *
 * \param taskId  Identifier printed with the dump.
 * \param devTask Task to dump; when null only the first log line is emitted.
 * \param isDyn   Forwarded to DumpTaskDetail().
 */
static void DumpTask(int64_t taskId, DeviceTask* devTask, bool isDyn)
{
    DEV_DEBUG("taskId=%ld, devTask=%p, isDyn=%d.", taskId, devTask, static_cast<int>(isDyn));
    if (devTask != nullptr) {
        DEV_DEBUG(
            "devtask { coreFunctionCnt=%lu, readyStateAddr=%#lx, "
            "readyAicQue=%#lx, readyAivQue=%#lx, readyAicpuQue=%#lx, "
            "coreFuncWsAddr=%#lx, stackWsAddr=%#lx, stackWsSize=%lu }.",
            devTask->coreFunctionCnt, devTask->coreFunctionReadyStateAddr, devTask->readyAicCoreFunctionQue,
            devTask->readyAivCoreFunctionQue, devTask->readyAicpuFunctionQue,
            devTask->coreFuncData.coreFunctionWsAddr, devTask->coreFuncData.stackWorkSpaceAddr,
            devTask->coreFuncData.stackWorkSpaceSize);
        DumpTaskDetail(devTask, isDyn);
    }
}
private:
/** \brief Returns the task-control slot at \p index of the current launch's pool (no bounds check). */
DeviceTaskCtrl& GetTaskCtrlInPool(int index)
{
    auto& runtimeDesc = devStartArgs_->deviceRuntimeDataDesc;
    return runtimeDesc.taskCtrlPool[index];
}
/** \brief Returns the scheduler task queue at \p index of the current launch (no bounds check). */
DeviceTaskCtrlQueue& GetTaskQueue(int index)
{
    auto& runtimeDesc = devStartArgs_->deviceRuntimeDataDesc;
    return runtimeDesc.taskQueueList[index];
}
/** \brief Returns the scheduler AICPU count stored in the current launch's control state. */
uint32_t GetScheAicpuNum()
{
    const auto& ctrlState = devStartArgs_->devCtrlState;
    return ctrlState.schAicpuNum;
}
private:
// Start arguments of the current launch; set by InitDyn().
DevStartArgs* devStartArgs_{nullptr};
// Most recently pushed task control; reset by InitDyn() and updated by PushTask().
DeviceTaskCtrl* lastTaskCtrl_{nullptr};
// Optional per-task inspector installed by RegisterTaskInspector(). Both members start out null because
// ExecDyn() tests inspectorEntry_ against nullptr even when no inspector was ever registered.
DeviceTaskInspectorEntry inspectorEntry_{nullptr};
void* inspector_{nullptr};
};
}