/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
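/*!
 * \file dev_encode_program.h
 * \brief Declares DevAscendProgram, the encoded program image, together with its relocation,
 *        reset and verification helpers.
 */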
#pragma once
#include "dev_encode_program_ctrlflow_cache.h"
#include "interface/tensor/symbol_handler.h"
// Forward declaration. Within this header the type is only named as
// std::shared_ptr<DyndevFunctionAttribute> in the declaration of
// DevAscendProgram::InitControlFlowCache, so an incomplete type is sufficient here.
namespace npu::tile_fwk {
class DyndevFunctionAttribute;
}
namespace npu::tile_fwk::dynamic {
// Element type of DevAscendProgram::symbolTable: a symbol name plus an index.
struct DevAscendProgramSymbol {
// Characters of the symbol name. DevAscendProgram::RelocProgram relocates this vector for every
// symbol table entry.
DevRelocVector<char> name;
// NOTE(review): presumably the index of this symbol in the symbol table — confirm against the encoder.
uint64_t index;
};
// Forward declaration; in this header the type is only used as the pointer type returned by
// DevAscendProgram::GetRuntimeDataList().
struct RuntimeDataRingBufferHead;
struct DevAscendProgram {
// Leading scalar fields of the program image. The fixed-size members of this struct are followed
// by the variable-length `data` payload that the DevRelocVector members refer to.
// NOTE(review): member order is presumably part of a binary layout shared with other components —
// confirm before reordering anything.

// Mostly cleared by ResetFromLaunch(); only the members listed in DevArgsPreservedParams survive.
DeviceArgs devArgs;
// Set back to 0 by ResetFromLaunch().
uint64_t workspaceSize;
uint64_t l2CacheOffset;
uint64_t configKey;
uint64_t hashKey;
uint32_t slotSize;
uint32_t runtimeOutcastPoolSize;
uint32_t assembleSlotSize;
uint32_t slottableOutcastSlotSize;
uint32_t ctrlBlockDim{0};
// Memory budget figures of the program, grouped into tensor memory, spilled AICore memory,
// metadata and debug buffers.
struct {
    // Tensor memory budget; Total() scales the aligned sum by `parallelism`.
    struct {
        uint64_t rootInner;
        uint64_t devTaskInnerExclusiveOutcasts;
        uint64_t maxStaticOutcastMem;
        uint64_t maxDynamicAssembleOutcastMem{0};
        uint64_t devTaskBoundaryOutcastNum{0};
        uint32_t parallelism{1};

        // Larger of the static and the dynamic-assemble outcast maxima.
        uint64_t MaxOutcastMem() const
        {
            return std::max(maxStaticOutcastMem, maxDynamicAssembleOutcastMem);
        }

        // rootInner + devTaskInnerExclusiveOutcasts + MaxOutcastMem() * devTaskBoundaryOutcastNum,
        // passed through AlignUp with a 32 KiB alignment and then multiplied by `parallelism`.
        uint64_t Total() const
        {
            static constexpr uint64_t ALIGNMENT_32K = 32 * 1024;
            const uint64_t boundaryOutcastBytes = MaxOutcastMem() * devTaskBoundaryOutcastNum;
            uint64_t unalignedBytes = rootInner + devTaskInnerExclusiveOutcasts + boundaryOutcastBytes;
            return AlignUp(unalignedBytes, ALIGNMENT_32K) * parallelism;
        }
    } tensor;

    uint64_t aicoreSpilled;

    // Metadata budget; Total() sums only general, dynamicCellMatch and stitchPool.
    struct {
        uint64_t general;
        uint64_t dynamicCellMatch{0};
        uint64_t stitchPool;
        uint64_t maxDynamicCellMatchTableMem{0};
        uint64_t dynamicCellMatchSlotNum{0};
        uint64_t stitchCacheSize{0};
        uint32_t generalSlabSize;
        uint32_t stitchSlabSize;

        uint64_t Total() const
        {
            const uint64_t withoutStitchPool = general + dynamicCellMatch;
            return withoutStitchPool + stitchPool;
        }
    } metadata;

    // Debug buffer budget.
    struct {
        uint64_t dumpTensor;
        uint64_t leafDump;
    } debug;

    // Sum of the tensor total, aicoreSpilled and both debug buffers. The metadata budget is not
    // part of this sum.
    uint64_t Total() const
    {
        const uint64_t tensorAndSpill = tensor.Total() + aicoreSpilled;
        return tensorAndSpill + debug.dumpTensor + debug.leafDump;
    }
} memBudget;
// Returned by GetDeviceRuntimeOffset().
DeviceRuntimeOffset deviceRuntimeOffset;
// Both members below are reset by ResetFromLaunch() (to nullptr / false).
const void* controlFlowBinaryAddr{nullptr};
std::atomic<bool> runtimeDataRingBufferInited{false};
uint32_t stitchFunctionsize{0};
uint32_t stitchMaxFunctionNum{0};
uint32_t ctrlFlowCacheSize{0};
uint32_t disableCtrlFlowCache{0};
uint32_t rootFuncMaxCallOpsize{0};
// The DevRelocVector members below describe ranges inside `data`. RelocProgram() rebases them in
// this declaration order, with the exceptions noted on individual members.
DevRelocVector<DevAscendProgramSymbol> symbolTable;
DevRelocVector<char> symbolTableNameList;
// Returned by GetExpressionTableSize(); a plain scalar, not touched by RelocProgram().
uint64_t expressionTableSize;
DevRelocVector<uint64_t> expressionTableOffsetList;
DevRelocVector<uint8_t> preGuardPage;
DevRelocVector<uint8_t> expressionTableBinary;
DevRelocVector<uint8_t> hostControlFlowBinary;
DevRelocVector<uint8_t> devControlFlowBinary;
// NOTE(review): unlike preGuardPage, this vector is not passed to RelocOffset() in RelocProgram() —
// confirm whether it is intentionally left empty/unused.
DevRelocVector<uint8_t> postGuardPage;
// One byte vector per encoded function; GetFunction() reinterprets an entry's data as a
// DevAscendFunction.
DevRelocVector<DevRelocVector<uint8_t>> devEncodeList;
DevRelocVector<uint8_t> devEncodeDataList;
DevRelocVector<DevCceBinary> cceCodeList;
DevRelocVector<DevAicpuLeafBinary> aicpuLeafCodeList;
DevRelocVector<int32_t> aicpuLeafCodeDataList;
DevRelocVector<uint64_t> startArgsInputTensorSlotIndexList;
DevRelocVector<uint64_t> startArgsOutputTensorSlotIndexList;
DevRelocVector<uint64_t> startArgsInputSymbolIndexList;
DevRelocVector<uint64_t> assembleSlotIndexList;
DevRelocVector<uint64_t> outputInplaceSlotList;
DevRelocVector<DevAscendProgramPartialUpdate> partialUpdateList;
// Filled with 0xFF bytes by ResetRerun().
DevRelocVector<uint64_t> cellMatchRuntimePartialUpdateTableList;
DevRelocVector<PrefetchInfo> prefetchInfoList;
DevRelocVector<uint8_t> disableL2List;
// Pointer handed out by GetControlFlowCache(); set to nullptr by ResetFromLaunch().
DevControlFlowCache* ctrlFlowCacheAnchor{nullptr};
DevControlFlowCache controlFlowCache;
// Names the vector that ends the payload; GetSize() measures up to programLastField.End().
// Being a preprocessor macro, it is not scoped to this struct.
#define programLastField controlFlowCache.cacheData
// Payload length in bytes: RuntimeVerify() requires the last range to end at &data[dataSize].
uint64_t dataSize;
// Start of the variable-length payload (zero-length array); RelocProgram() starts walking here.
uint8_t data[0];
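/*
 * Contents of the trailing `data` payload, in order:
 * DevAscendProgramSymbol symbolTableData[]
 * char symbolTableNameListData[]
 * uint64_t expressionTableOffsetListData[]
 * uint8_t preGuardPageData[PAGE_SIZE]
 * uint8_t expressionTableBinaryData[]
 * uint8_t hostControlFlowBinaryData[]
 * uint8_t devControlFlowBinaryData[]
 * DevRelocVector<uint8_t> devEncodeList[]
 * uint8_t devEncodeDataList[]
 * DevRelocVector<uint8_t> cceCodeList[]
 * uint64_t startArgsInputTensorSlotIndexListData[]
 * uint64_t startArgsOutputTensorSlotIndexListData[]
 * uint64_t startArgsInputSymbolIndexListData[]
 * uint64_t assembleSlotIndexList[]
 * uint64_t outputInplaceSlotList[]
 * DevAscendProgramPartialUpdate partialUpdateList[]
 * DevAscendProgramSlot slotList[]
 *
 * NOTE(review): this list does not mention several vectors that RelocProgram() walks (for example
 * aicpuLeafCodeList, aicpuLeafCodeDataList, cellMatchRuntimePartialUpdateTableList,
 * prefetchInfoList, disableL2List and the controlFlowCache vectors); RelocProgram() shows the
 * order that is actually used.
 */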
// Reinterprets devArgs.runtimeDataRingBufferAddr as a pointer to the runtime data ring buffer head.
// No validity check is performed on the stored address.
RuntimeDataRingBufferHead* GetRuntimeDataList()
{
    const auto ringBufferAddr = devArgs.runtimeDataRingBufferAddr;
    return reinterpret_cast<RuntimeDataRingBufferHead*>(ringBufferAddr);
}
// Read-only element access: forwards to localvec[index] and returns the resulting reference.
template <typename T>
const T& At(const DevRelocVector<T>& localvec, int index) const
{
    const T& element = localvec[index];
    return element;
}
// Mutable element access: forwards to localvec[index] and returns the resulting reference.
template <typename T>
T& At(DevRelocVector<T>& localvec, int index)
{
    T& element = localvec[index];
    return element;
}
// Dump helpers. They are only declared here; the definitions are not part of this header.
// Each Dump* helper receives an indent value and the std::ostringstream it writes to; the helpers
// that take `dumpAddr` additionally receive that flag.
// NOTE(review): presumably `indent` is an indentation level and `dumpAddr` enables printing of
// addresses — confirm in the implementation file.
void DumpCce(std::ostringstream& oss, int indent) const;
void DumpControlFlow(const int indent, const bool dumpAddr, std::ostringstream& oss) const;
void DumpExpressionTable(const int indent, const bool dumpAddr, std::ostringstream& oss) const;
void DumpBasicInfo(const int indent, std::ostringstream& oss) const;
void DumpSymbolTable(const int indent, std::ostringstream& oss) const;
void DumpInputOutputSlots(const int indent, std::ostringstream& oss) const;
void DumpAssembleAndInplaceSlots(const int indent, std::ostringstream& oss) const;
void DumpPartialUpdate(const int indent, std::ostringstream& oss) const;
void DumpInputSymbols(const int indent, std::ostringstream& oss) const;
// Returns the dump as a string; defaults are indent 0 and dumpAddr false.
std::string Dump(const int indent = 0, const bool dumpAddr = false) const;
// Takes the path of a file. NOTE(review): presumably writes the dump to that file — confirm.
void DumpFile(const std::string& filePath) const;
// Copies startArgsInputTensorSlotIndexList into a std::vector<int>, converting each uint64_t
// element to int. Returns an empty vector when the list is empty.
std::vector<int> GetInputTensorSlotIndexList() const
{
    std::vector<int> slots;
    for (size_t pos = 0; pos < startArgsInputTensorSlotIndexList.size(); ++pos) {
        const int slot = static_cast<int>(At(startArgsInputTensorSlotIndexList, static_cast<int>(pos)));
        slots.push_back(slot);
    }
    return slots;
}
// Copies startArgsOutputTensorSlotIndexList into a std::vector<int>, converting each uint64_t
// element to int. Returns an empty vector when the list is empty.
std::vector<int> GetOutputTensorSlotIndexList() const
{
    std::vector<int> slots;
    for (size_t pos = 0; pos < startArgsOutputTensorSlotIndexList.size(); ++pos) {
        const int slot = static_cast<int>(At(startArgsOutputTensorSlotIndexList, static_cast<int>(pos)));
        slots.push_back(slot);
    }
    return slots;
}
// Copies assembleSlotIndexList into a std::vector<int>, converting each uint64_t element to int.
// Returns an empty vector when the list is empty.
std::vector<int> GetAssembleTensorSlotIndexList() const
{
    std::vector<int> slots;
    for (size_t pos = 0; pos < assembleSlotIndexList.size(); ++pos) {
        const int slot = static_cast<int>(At(assembleSlotIndexList, static_cast<int>(pos)));
        slots.push_back(slot);
    }
    return slots;
}
// Returns the entries of assembleSlotIndexList converted to int; an empty list yields an empty
// vector.
//
// The elements are uint64_t, so they must be converted one by one. Binding `const int&` to an
// element would only reference a converted temporary, and the addresses of two such temporaries do
// not delimit a range that a vector could be built from. Indexing size() - 1 is also invalid for
// an empty list, which the loop below avoids.
// NOTE(review): despite its name, this getter has always read assembleSlotIndexList — confirm that
// this is the intended source list.
std::vector<int> GetPartialUpdateTensorSlotIndexList() const
{
    std::vector<int> slotIndexList;
    const size_t count = assembleSlotIndexList.size();
    slotIndexList.reserve(count);
    for (size_t i = 0; i < count; i++) {
        slotIndexList.push_back(static_cast<int>(At(assembleSlotIndexList, static_cast<int>(i))));
    }
    return slotIndexList;
}
// Returns {pointer to the data of devControlFlowBinary, its size()}.
std::tuple<const void*, uint64_t> GetDevControlFlowBinary() const
{
    const void* binary = reinterpret_cast<const void*>(devControlFlowBinary.Data());
    const uint64_t binarySize = static_cast<uint64_t>(devControlFlowBinary.size());
    return std::make_tuple(binary, binarySize);
}
// Returns {pointer to the data of hostControlFlowBinary, its size()}.
std::tuple<const void*, uint64_t> GetHostControlFlowBinary() const
{
    const void* binary = reinterpret_cast<const void*>(hostControlFlowBinary.Data());
    const uint64_t binarySize = static_cast<uint64_t>(hostControlFlowBinary.size());
    return std::make_tuple(binary, binarySize);
}
// Returns {expressionTableBinary data, its size(), expressionTableOffsetList data, its size()}.
std::tuple<const void*, uint64_t, const uint64_t*, uint64_t> GetExpressionTableBinary() const
{
    const void* binary = reinterpret_cast<const void*>(expressionTableBinary.Data());
    const uint64_t binarySize = static_cast<uint64_t>(expressionTableBinary.size());
    const uint64_t* offsets = expressionTableOffsetList.Data();
    const uint64_t offsetCount = static_cast<uint64_t>(expressionTableOffsetList.size());
    return std::make_tuple(binary, binarySize, offsets, offsetCount);
}
// Number of entries in symbolTable.
uint64_t GetSymbolTableSize() const
{
    return static_cast<uint64_t>(symbolTable.size());
}
// Returns the stored expressionTableSize field (not derived from any vector's size()).
uint64_t GetExpressionTableSize() const
{
    return expressionTableSize;
}
// Number of encoded functions, i.e. the number of entries in devEncodeList.
uint64_t GetFunctionSize() const
{
    return static_cast<uint64_t>(devEncodeList.size());
}
// Returns the data of devEncodeList[index] reinterpreted as a mutable DevAscendFunction.
// Constness is cast away even though this method is const; `index` is not range-checked here.
DevAscendFunction* GetFunction(int index) const
{
    const uint8_t* encoded = devEncodeList[index].Data();
    return reinterpret_cast<DevAscendFunction*>(const_cast<uint8_t*>(encoded));
}
// Linear search over all encoded functions; returns the first one whose GetRawName() compares
// equal to rawName, or nullptr when none matches.
DevAscendFunction* GetFunctionByRawName(const std::string& rawName) const
{
    const size_t functionCount = GetFunctionSize();
    size_t idx = 0;
    while (idx < functionCount) {
        DevAscendFunction* candidate = GetFunction(static_cast<int>(idx));
        if (candidate->GetRawName() == rawName) {
            return candidate;
        }
        ++idx;
    }
    return nullptr;
}
// Address of cceCodeList[index]; `index` is not range-checked here.
const DevCceBinary* GetCceBinary(int index) const
{
    const DevCceBinary& entry = cceCodeList[index];
    return &entry;
}
// Address of aicpuLeafCodeList[index]; `index` is not range-checked here.
const DevAicpuLeafBinary* GetAicpuLeafBinary(int index) const
{
    const DevAicpuLeafBinary& entry = aicpuLeafCodeList[index];
    return &entry;
}
// Returns ctrlFlowCacheAnchor (not the address of the embedded controlFlowCache member).
// The anchor defaults to nullptr and is reset to nullptr by ResetFromLaunch().
DevControlFlowCache* GetControlFlowCache()
{
    return ctrlFlowCacheAnchor;
}
// Consumes one vector's elements from the payload cursor and relocates the vector.
//   shift  - delta passed to list.DeviceRelocData().
//   offset - in/out cursor; advanced by list.ElementSize() * list.size() bytes.
//   list   - the vector to relocate.
// Returns the cursor value from before the advance, typed as a pointer to the element type.
template <typename Ty>
typename Ty::ElementType* RelocOffset(intptr_t shift, void*& offset, Ty& list)
{
    using Element = typename Ty::ElementType;
    Element* const elements = reinterpret_cast<Element*>(offset);
    const auto payloadBytes = list.ElementSize() * list.size();
    offset = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(offset) + payloadBytes);
    list.DeviceRelocData(shift);
    return elements;
}
// Relocates every DevRelocVector of the program by (dstProgram - srcProgram).
//   srcProgram - base address the vectors currently refer to.
//   dstProgram - base address they should refer to afterwards.
//   relocFunc  - when true, additionally calls Reloc() on every encoded function.
// The payload is walked with a cursor that starts at `data` and is advanced by RelocOffset() once
// per vector, so the order of the calls below must match the order in which the vectors' elements
// are stored in `data`.
void RelocProgram(uint64_t srcProgram, uint64_t dstProgram, bool relocFunc = false)
{
intptr_t shift = static_cast<int64_t>(dstProgram) - static_cast<int64_t>(srcProgram);
void* offset = data;
// Vectors whose elements contain nested DevRelocVectors are fixed up through the pointer that
// RelocOffset() returns (the cursor position at the current address).
// NOTE(review): presumably because the outer vector, once shifted, may no longer be
// dereferenceable from here — confirm.
auto symbolTablePtr = RelocOffset(shift, offset, symbolTable);
for (size_t i = 0; i < symbolTable.size(); i++) {
symbolTablePtr[i].name.DeviceRelocData(shift);
}
RelocOffset(shift, offset, symbolTableNameList);
RelocOffset(shift, offset, expressionTableOffsetList);
RelocOffset(shift, offset, preGuardPage);
RelocOffset(shift, offset, expressionTableBinary);
RelocOffset(shift, offset, hostControlFlowBinary);
RelocOffset(shift, offset, devControlFlowBinary);
// NOTE(review): postGuardPage is declared between devControlFlowBinary and devEncodeList but is
// not consumed here — confirm that it never occupies payload bytes.
auto devEncodeListPtr = RelocOffset(shift, offset, devEncodeList);
for (size_t i = 0; i < devEncodeList.size(); i++) {
devEncodeListPtr[i].DeviceRelocData(shift);
}
RelocOffset(shift, offset, devEncodeDataList);
RelocOffset(shift, offset, cceCodeList);
auto aicpuLeafCodeListPtr = RelocOffset(shift, offset, aicpuLeafCodeList);
for (size_t i = 0; i < aicpuLeafCodeList.size(); i++) {
aicpuLeafCodeListPtr[i].aicpuLeafCode.DeviceRelocData(shift);
}
RelocOffset(shift, offset, aicpuLeafCodeDataList);
RelocOffset(shift, offset, startArgsInputTensorSlotIndexList);
RelocOffset(shift, offset, startArgsOutputTensorSlotIndexList);
RelocOffset(shift, offset, startArgsInputSymbolIndexList);
RelocOffset(shift, offset, assembleSlotIndexList);
RelocOffset(shift, offset, outputInplaceSlotList);
auto partialUpdateListPtr = RelocOffset(shift, offset, partialUpdateList);
for (size_t i = 0; i < partialUpdateList.size(); i++) {
// Uses the "MaybeNull" variant for the per-entry table, unlike the other nested vectors.
partialUpdateListPtr[i].cellMatchRuntimePartialUpdateTable.DeviceRelocDataMaybeNull(shift);
}
RelocOffset(shift, offset, cellMatchRuntimePartialUpdateTableList);
RelocOffset(shift, offset, prefetchInfoList);
RelocOffset(shift, offset, disableL2List);
if (relocFunc) {
// GetFunction() reads devEncodeList after it has been shifted, and each function is relocated
// against its own address.
// NOTE(review): this presumably requires the shifted addresses to be valid in the current
// address space — confirm with the callers that pass relocFunc = true.
for (int i = 0; i < static_cast<int>(GetFunctionSize()); i++) {
DevAscendFunction* func = GetFunction(i);
func->Reloc(reinterpret_cast<uint64_t>(func), true);
}
}
// Vectors owned by the embedded control flow cache come last in the payload.
RelocOffset(shift, offset, controlFlowCache.inputTensorDataList);
RelocOffset(shift, offset, controlFlowCache.outputTensorDataList);
for (uint32_t i = 0; i < SCH_DEVTASK_MAX_PARALLELISM; i++) {
RelocOffset(
shift, offset, controlFlowCache.runtimeBackup.workspace.tensorAllocators[i].slottedOutcastsBlockList);
}
RelocOffset(shift, offset, controlFlowCache.runtimeBackup.slotContext.slotList);
RelocOffset(shift, offset, controlFlowCache.runtimeBackup.workspace.runtimeOutcastTensorPool);
RelocOffset(shift, offset, controlFlowCache.deviceTaskCacheList);
// cacheData is the last vector; it is what the programLastField macro refers to.
RelocOffset(shift, offset, controlFlowCache.cacheData);
}
// Snapshot of the DeviceArgs members that ResetFromLaunch() keeps while it zeroes the rest of
// devArgs. Filled by BackupDevArgsParams() and written back by RestoreDevArgsParams(); each member
// mirrors the DeviceArgs member of the same name.
struct DevArgsPreservedParams {
uint32_t nrAic;
uint32_t nrAiv;
uint32_t nrAicpu;
uint32_t nrValidAic;
uint32_t scheCpuNum;
uint32_t maxAicpuNum;
uint32_t launchSchedAicpuNum;
ArchInfo archInfo;
uint64_t dynamicCellMatchAddr;
uint64_t dynamicCellMatchCapacity;
bool hasAicpuTask;
bool launchSchedSameCluster;
};
// Copies every member listed in DevArgsPreservedParams out of `src` and returns the snapshot.
// `src` is not modified.
DevArgsPreservedParams BackupDevArgsParams(const DeviceArgs& src)
{
    DevArgsPreservedParams snapshot;

    // uint32_t members
    snapshot.nrAic = src.nrAic;
    snapshot.nrAiv = src.nrAiv;
    snapshot.nrAicpu = src.nrAicpu;
    snapshot.nrValidAic = src.nrValidAic;
    snapshot.scheCpuNum = src.scheCpuNum;
    snapshot.maxAicpuNum = src.maxAicpuNum;
    snapshot.launchSchedAicpuNum = src.launchSchedAicpuNum;

    // ArchInfo member
    snapshot.archInfo = src.archInfo;

    // uint64_t members
    snapshot.dynamicCellMatchAddr = src.dynamicCellMatchAddr;
    snapshot.dynamicCellMatchCapacity = src.dynamicCellMatchCapacity;

    // bool members
    snapshot.hasAicpuTask = src.hasAicpuTask;
    snapshot.launchSchedSameCluster = src.launchSchedSameCluster;

    return snapshot;
}
// Writes every member of the snapshot `params` back into `dst`. Members of `dst` that are not part
// of DevArgsPreservedParams are left untouched.
void RestoreDevArgsParams(DeviceArgs& dst, const DevArgsPreservedParams& params)
{
    const DevArgsPreservedParams& snapshot = params;

    // uint32_t members
    dst.nrAic = snapshot.nrAic;
    dst.nrAiv = snapshot.nrAiv;
    dst.nrAicpu = snapshot.nrAicpu;
    dst.nrValidAic = snapshot.nrValidAic;
    dst.scheCpuNum = snapshot.scheCpuNum;
    dst.maxAicpuNum = snapshot.maxAicpuNum;
    dst.launchSchedAicpuNum = snapshot.launchSchedAicpuNum;

    // ArchInfo member
    dst.archInfo = snapshot.archInfo;

    // uint64_t members
    dst.dynamicCellMatchAddr = snapshot.dynamicCellMatchAddr;
    dst.dynamicCellMatchCapacity = snapshot.dynamicCellMatchCapacity;

    // bool members
    dst.hasAicpuTask = snapshot.hasAicpuTask;
    dst.launchSchedSameCluster = snapshot.launchSchedSameCluster;
}
void ResetFromLaunch()
{
DevArgsPreservedParams preservedParams = BackupDevArgsParams(devArgs);
memset_s(&devArgs, sizeof(devArgs), 0, sizeof(devArgs));
RestoreDevArgsParams(devArgs, preservedParams);
controlFlowBinaryAddr = nullptr;
runtimeDataRingBufferInited = false;
workspaceSize = 0;
ctrlFlowCacheAnchor = nullptr;
RelocProgram(reinterpret_cast<int64_t>(this), 0);
}
void ResetRerun()
{
uint64_t* RuntimePartialUpdateTable = cellMatchRuntimePartialUpdateTableList.Data();
uint64_t RuntimePartialUpdateTableSize = cellMatchRuntimePartialUpdateTableList.DataSize();
memset_s(RuntimePartialUpdateTable, RuntimePartialUpdateTableSize, 0xFF, RuntimePartialUpdateTableSize);
}
// Address range [begin, end) occupied by the elements of one DevRelocVector, stored as integers.
struct DevRelocRange {
    uintptr_t begin;
    uintptr_t end;

    // Intentionally implicit: RuntimeVerify() builds a std::vector<DevRelocRange> directly from a
    // braced list of vectors with different element types.
    template <typename T>
    DevRelocRange(const DevRelocVector<T>& v)
        : begin(reinterpret_cast<uintptr_t>(v.begin())), end(reinterpret_cast<uintptr_t>(v.end()))
    {}
};
// Consistency check of the payload layout. It verifies that the listed vectors
//   - start exactly at `data`,
//   - each satisfy begin <= end,
//   - appear in ascending, non-overlapping address order, and
//   - end exactly at &data[dataSize].
// Every violated condition is first reported with DEV_ERROR and then checked again with
// DEV_ASSERT / DEV_ASSERT_MSG. Both parameters are currently unused.
void RuntimeVerify(uintptr_t workspaceBegin, uintptr_t workspaceEnd) const
{
(void)workspaceBegin, (void)workspaceEnd;
// NOTE(review): DEV_IF_VERBOSE_DEBUG presumably expands to an `if` on the verbose-debug setting,
// so the checks below only run in that mode and the function returns early otherwise — confirm
// against the macro definition.
DEV_IF_VERBOSE_DEBUG {}
else
{
return;
}
// Ranges in payload order. Some vectors that RelocProgram() walks are not listed (preGuardPage,
// expressionTableBinary, startArgsInputSymbolIndexList, tensorAllocators[1..]); the resulting
// gaps are accepted because neighbours are only required to satisfy end <= next begin.
std::vector<DevRelocRange> rangeList = {
symbolTable,
symbolTableNameList,
expressionTableOffsetList,
hostControlFlowBinary,
devControlFlowBinary,
devEncodeList,
devEncodeDataList,
cceCodeList,
aicpuLeafCodeList,
aicpuLeafCodeDataList,
startArgsInputTensorSlotIndexList,
startArgsOutputTensorSlotIndexList,
assembleSlotIndexList,
outputInplaceSlotList,
partialUpdateList,
cellMatchRuntimePartialUpdateTableList,
prefetchInfoList,
disableL2List,
controlFlowCache.inputTensorDataList,
controlFlowCache.outputTensorDataList,
controlFlowCache.runtimeBackup.workspace.tensorAllocators[0].slottedOutcastsBlockList,
controlFlowCache.runtimeBackup.slotContext.slotList,
controlFlowCache.runtimeBackup.workspace.runtimeOutcastTensorPool,
controlFlowCache.deviceTaskCacheList,
controlFlowCache.cacheData,
};
// The first range (symbolTable) must begin exactly at the start of the payload.
if ((uintptr_t)data != rangeList[0].begin) {
DEV_ERROR(
ProgEncodeErr::RANGE_VERIFY_FAILED,
"#ctrl.program.verify: Assertion failed: data (0x%p) != rangeList[0].begin (0x%p)", data,
(void*)rangeList[0].begin);
}
DEV_ASSERT(ProgEncodeErr::RANGE_VERIFY_FAILED, (uintptr_t)data == rangeList[0].begin);
// The first range itself must not be inverted.
if (rangeList[0].begin > rangeList[0].end) {
DEV_ERROR(
ProgEncodeErr::RANGE_VERIFY_FAILED,
"#ctrl.program.verify: Assertion failed: rangeList[0].begin (0x%p) > rangeList[0].end (0x%p)",
(void*)rangeList[0].begin, (void*)rangeList[0].end);
}
DEV_ASSERT(ProgEncodeErr::RANGE_VERIFY_FAILED, rangeList[0].begin <= rangeList[0].end);
// Every following range must start at or after the end of its predecessor and must not be
// inverted.
for (size_t k = 1; k < rangeList.size(); k++) {
if (rangeList[k - 1].end > rangeList[k].begin) {
DEV_ERROR(
ProgEncodeErr::RANGE_VERIFY_FAILED,
"#ctrl.program.verify: Ranges overlap: range[%d].end (0x%p) > range[%d].begin (0x%p)", (int)(k - 1),
(void*)rangeList[k - 1].end, (int)k, (void*)rangeList[k].begin);
}
if (rangeList[k].begin > rangeList[k].end) {
DEV_ERROR(
ProgEncodeErr::RANGE_VERIFY_FAILED,
"#ctrl.program.verify: Invalid range: range[%d].begin (0x%p) > range[%d].end (0x%p)", (int)k,
(void*)rangeList[k].begin, (int)k, (void*)rangeList[k].end);
}
DEV_ASSERT_MSG(
ProgEncodeErr::RANGE_VERIFY_FAILED, rangeList[k - 1].end <= rangeList[k].begin, "range:%d->%d",
(int)(k - 1), (int)(k));
DEV_ASSERT_MSG(
ProgEncodeErr::RANGE_VERIFY_FAILED, rangeList[k].begin <= rangeList[k].end, "range:%d", (int)k);
}
// The last range (controlFlowCache.cacheData) must end exactly where the payload ends.
uintptr_t lastEnd = rangeList.back().end;
uintptr_t dataEnd = (uintptr_t)(&data[dataSize]);
if (lastEnd != dataEnd) {
DEV_ERROR(
ProgEncodeErr::RANGE_VERIFY_FAILED,
"#ctrl.program.verify: Last range end does not match data end: rangeList.back().end (0x%p) != dataEnd "
"(0x%p)",
(void*)lastEnd, (void*)dataEnd);
}
DEV_ASSERT(ProgEncodeErr::RANGE_VERIFY_FAILED, lastEnd == dataEnd);
}
// Number of bytes from the start of this object to the end of the last payload vector
// (programLastField, i.e. controlFlowCache.cacheData).
uint64_t GetSize() const
{
    const uintptr_t imageBegin = reinterpret_cast<uintptr_t>(this);
    const uintptr_t imageEnd = reinterpret_cast<uintptr_t>(programLastField.End());
    return imageEnd - imageBegin;
}
// Read-only access to the deviceRuntimeOffset member.
const DeviceRuntimeOffset& GetDeviceRuntimeOffset() const
{
    return deviceRuntimeOffset;
}
// Stores `parallelism` in memBudget.tensor.parallelism, the factor that memBudget.tensor.Total()
// multiplies the aligned tensor budget by. The value is not validated.
void SetParallelism(uint32_t parallelism)
{
    memBudget.tensor.parallelism = parallelism;
}
uint32_t GetParallelism() { return memBudget.tensor.parallelism; }
// Encoding helpers, accessible only to EncodeDevAscendProgramInfo (declared a friend below).
// They are declared here and defined elsewhere. Every helper takes a running `initOffset` by
// reference and a `fillContent` flag.
// NOTE(review): presumably `initOffset` is the write position inside `data` and
// `fillContent == false` performs a size-only pass — confirm in the implementation file.
private:
friend struct EncodeDevAscendProgramInfo;
// Takes the symbol table to encode.
void InitSymbolTable(uintdevptr_t& initOffset, SymbolicSymbolTable* symbolTableInput, bool fillContent);
// Takes one byte vector per expression table binary.
void InitExpressionTableBinary(
uintdevptr_t& initOffset, const std::vector<std::vector<uint8_t>>& expressionTableBinaryListInput,
bool fillContent);
// Takes the host-side and device-side control flow binaries.
void InitControlFlowBinary(
uintdevptr_t& initOffset, const std::vector<uint8_t>& hostControlFlowBinaryInput,
const std::vector<uint8_t>& devControlFlowBinaryInput, bool fillContent);
// Takes one encoded byte vector per function.
void InitDevEncodeList(
uintdevptr_t& initOffset, const std::vector<std::vector<uint8_t>>& devEncodeListInput, bool fillContent);
void InitCceCodeList(uintdevptr_t& initOffset, const std::vector<CceCodeInfo>& cceInfo, bool fillContent);
void InitPrefetchInfoList(uintdevptr_t& initOffset, const std::vector<L2Info>& l2InfoList, bool fillContent);
void InitDisableL2List(uintdevptr_t& initOffset, const std::vector<uint8_t>& disableL2, bool fillContent);
// Takes the five index lists: input tensor slots, output tensor slots, input symbols, assemble
// slots and in-place slots.
void InitStartArgsABIParamList(
uintdevptr_t& initOffset, const std::vector<int>& tStartArgsInputTensorSlotIndexList,
const std::vector<int>& tStartArgsOutputTensorSlotIndexList,
const std::vector<int>& tStartArgsInputSymbolIndexList, const std::vector<int>& tAsembleSlotIndexList,
const std::vector<int>& tInplaceSlotIndexList, bool fillContent);
// Takes the encoded functions, the root function key dictionary, the per-slot incast/outcast
// dictionaries and the list of partial-update slot indices.
void InitPartialUpdateSlot(
uintdevptr_t& initOffset, const std::vector<std::vector<uint8_t>>& devEncodeListInput,
const std::unordered_map<Function*, int>& rootFuncKeyDict,
const std::unordered_map<int, std::unordered_map<Function*, int>>& slotRootIncastDict,
const std::unordered_map<int, std::unordered_map<Function*, int>>& slotRootOutcastDict,
const std::vector<int>& tPartialUpdateSlotIndexList, bool fillContent);
// Takes the dynamic-device function attribute as a shared pointer.
void InitControlFlowCache(
uintdevptr_t& initOffset, const std::shared_ptr<DyndevFunctionAttribute>& dyndevAttr, bool fillContent);
};
}