* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "sk_task_builder.h"
#include "sk_graph.h"
#include "sk_dump_json.h"
#include "sk_log.h"
#include "sk_constant_codegen.h"
#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
#include <new>
#include <string>
#include <tuple>
#include "securec.h"
#include "sk_event_recorder.h"
#include "runtime/kernel.h"
extern "C" aclrtBinHandle AscendGetEntryBinHandle();
namespace {
constexpr size_t kCounterAlignBytes = 64;
inline size_t AlignUp(size_t value, size_t align)
{
if (align == 0) {
return value;
}
return ((value + align - 1) / align) * align;
}
SkQueueType ToQueueType(SkKernelType kernelType)
{
switch (kernelType) {
case SkKernelType::AIC_ONLY:
case SkKernelType::MIX_AIC_1_0:
return SkQueueType::AIC;
case SkKernelType::AIV_ONLY:
case SkKernelType::MIX_AIV_1_0:
return SkQueueType::AIV;
case SkKernelType::MIX_AIC_1_1:
return SkQueueType::MIX_1_1;
case SkKernelType::MIX_AIC_1_2:
return SkQueueType::MIX_1_2;
default:
SK_LOGE("unsupported kernel type %s for super kernel, using default value : aic", to_string(kernelType));
return SkQueueType::AIC;
}
}
aclrtFuncHandle ResolveSkEntryFunc(const char* funcName)
{
aclrtBinHandle bhdl = AscendGetEntryBinHandle();
if (bhdl == nullptr) {
SK_LOGE("failed to get entry bin handle: AscendGetEntryBinHandle() returned null");
return nullptr;
}
aclrtFuncHandle fhdl = nullptr;
CHECK_ACL(aclrtBinaryGetFunction(bhdl, funcName, &fhdl));
if (fhdl == nullptr) {
SK_LOGE("failed to resolve entry func handle: funcName=%s, binHandle=%p", funcName, bhdl);
return nullptr;
}
return fhdl;
}
bool UsesAic(SkQueueType type)
{
return type == SkQueueType::AIC || type == SkQueueType::MIX_1_1 || type == SkQueueType::MIX_1_2;
}
bool UsesAiv(SkQueueType type)
{
return type == SkQueueType::AIV || type == SkQueueType::MIX_1_1 || type == SkQueueType::MIX_1_2;
}
const KernelInfos& GetKernelInfos(const SuperKernelBaseNode* node)
{
return node->GetNodeInfos().kernelInfos;
}
SkQueueType ToEventQueueType(SkQueueType queueType, bool aicAvailable = true, bool aivAvailable = true)
{
SkQueueType eventQueueType = (queueType == SkQueueType::AIC) ? SkQueueType::AIC : SkQueueType::AIV;
if (eventQueueType == SkQueueType::AIC && !aicAvailable) {
return SkQueueType::AIV;
}
if (eventQueueType == SkQueueType::AIV && !aivAvailable) {
return SkQueueType::AIC;
}
return eventQueueType;
}
SkQueueType InferFirstKernelEventQueueType(const std::vector<SuperKernelBaseNode*>& tasks, bool aicAvailable = true,
bool aivAvailable = true)
{
for (const auto* node : tasks) {
if (node != nullptr && node->GetNodeType() == SkNodeType::NODE_KERNEL) {
return ToEventQueueType(ToQueueType(GetKernelInfos(node).kernelType), aicAvailable, aivAvailable);
}
}
return SkQueueType::UNKNOWN;
}
SkQueueType ResolveMixWaitEventQueueType(const SuperKernelBaseNode* prevKernel, const SuperKernelBaseNode* mixKernel)
{
if (prevKernel == nullptr) {
return SkQueueType::AIV;
}
SkQueueType prevKernelQueueType =
(prevKernel == nullptr) ? SkQueueType::UNKNOWN : ToQueueType(GetKernelInfos(prevKernel).kernelType);
if (prevKernelQueueType != SkQueueType::AIC && prevKernelQueueType != SkQueueType::AIV) {
return SkQueueType::AIV;
}
const bool sameStream = prevKernel->GetStreamIdxInGraph() == mixKernel->GetStreamIdxInGraph();
const bool hasDirectDep = prevKernel->sendToNodeId.count(mixKernel->GetNodeId()) > 0 ||
mixKernel->receiveNodeId.count(prevKernel->GetNodeId()) > 0;
return (sameStream || hasDirectDep) ? prevKernelQueueType
: ((prevKernelQueueType == SkQueueType::AIV) ? SkQueueType::AIC
: SkQueueType::AIV);
}
void DumpTaskQueDetail(const TaskQue* que, const char* name)
{
SK_LOGD("%s TaskQue: cap=%u, tasks=%u", name, que->cap, que->taskCnt);
for (uint32_t i = 0; i < que->taskCnt; ++i) {
const TaskInfo& ti = que->taskInfos[i];
SK_LOGD("[%u] type=%s, idx=%u, blk=%u, entries=%u, args=0x%llx, debugOptions=0x%llx", i, to_string(ti.type), ti.index,
ti.numBlocks, ti.entryCnt, (unsigned long long)ti.args, (unsigned long long)ti.debugOptions);
for (uint32_t j = 0; j < ti.entryCnt; ++j) {
SK_LOGD(" entry[%u]=0x%llx", j, (unsigned long long)ti.entry[j]);
}
}
}
void DumpDeviceArgsDetail(std::string skFuncName, const SkDeviceEntryArgs* args)
{
SK_LOGD("Dumping device args for function: %s", skFuncName.c_str());
const uint8_t* base = (const uint8_t*)args;
const SkHeaderInfo& hdr = args->skHeader;
SK_LOGD("SkHeaderInfo: aicOff=%u, aivOff=%u, counterOff=%u, dfxOff=%u, eventConfigOff=%u, nodeCnt=%u, totalSize=%lu\n",
hdr.aicQueOffset, hdr.aivQueOffset, hdr.counterOffset, hdr.dfxOffset, hdr.eventConfigOffset, hdr.nodeCnt,
hdr.totalSize);
DumpTaskQueDetail((const TaskQue*)(base + args->skHeader.aicQueOffset), "AIC");
DumpTaskQueDetail((const TaskQue*)(base + args->skHeader.aivQueOffset), "AIV");
const SkDfxInfo* dfx = (const SkDfxInfo*)(base + args->skHeader.dfxOffset);
for (uint32_t i = 0; i < args->skHeader.nodeCnt; ++i) {
SK_LOGD("dfx[%u]: bin=0x%llx, ori=0x%llx, aicSize=0x%x, aivSize=0x%x, numBlocks=%u, cubeNum=%u, vecNum=%u",
i, (unsigned long long)dfx[i].binHdl,
(unsigned long long)dfx[i].funcHdlOri,
dfx[i].aicSize, dfx[i].aivSize,
dfx[i].numBlocks, dfx[i].cubeNum, dfx[i].vecNum);
SK_LOGD(" entryAic[0]=0x%llx, entryAic[1]=0x%llx, entryAic[2]=0x%llx, entryAic[3]=0x%llx",
(unsigned long long)dfx[i].entryAic[0], (unsigned long long)dfx[i].entryAic[1],
(unsigned long long)dfx[i].entryAic[2], (unsigned long long)dfx[i].entryAic[3]);
SK_LOGD(" entryAiv[0]=0x%llx, entryAiv[1]=0x%llx, entryAiv[2]=0x%llx, entryAiv[3]=0x%llx",
(unsigned long long)dfx[i].entryAiv[0], (unsigned long long)dfx[i].entryAiv[1],
(unsigned long long)dfx[i].entryAiv[2], (unsigned long long)dfx[i].entryAiv[3]);
}
}
void DumpDeviceArgs(std::string skFuncName, const SkDeviceEntryArgs* args){
{
SK_LOG_CONTEXT_SIMPLE("sk_device_args.log");
DumpDeviceArgsDetail(skFuncName, args);
}
DumpDeviceArgsDetail(skFuncName, args);
}
enum class SearchDirection : uint8_t { PREV, NEXT };
const char* to_string(SearchDirection dir)
{
return (dir == SearchDirection::PREV) ? "PREV" : "NEXT";
}
std::string BuildSearchPathString(const std::vector<uint64_t>& path)
{
std::string pathStr;
for (size_t idx = 0; idx < path.size(); ++idx) {
if (idx > 0) {
pathStr += "->";
}
pathStr += std::to_string(path[idx]);
}
return pathStr.empty() ? "empty" : pathStr;
}
void LogKernelSearchFailure(uint64_t startNodeId, SearchDirection direction, const std::vector<uint64_t>& path,
const char* reason, uint64_t failedNodeId)
{
std::string pathStr = BuildSearchPathString(path);
SK_LOGI("FindKernelNodeInDirection unsuccessful: startNodeId=%lu, direction=%s, reason=%s, failedNodeId=%lu, path=%s",
startNodeId, to_string(direction), reason, failedNodeId, pathStr.c_str());
}
const SuperKernelBaseNode* ResolveCurrentKernel(uint64_t curNodeId, const SuperKernelBaseNode* current,
const std::unordered_map<uint64_t, const SuperKernelBaseNode*>& cache)
{
auto it = cache.find(curNodeId);
if (it != cache.end()) {
return it->second;
}
if (current->GetNodeType() == SkNodeType::NODE_KERNEL) {
return current;
}
return nullptr;
}
bool CacheTraversedNodes(const SuperKernelGraph& graph, const std::vector<uint64_t>& path, SkNodeType startNodeType,
const SuperKernelBaseNode* result,
std::unordered_map<uint64_t, const SuperKernelBaseNode*>& cache, uint64_t& failedNodeId)
{
for (uint64_t nodeId : path) {
auto* pathNode = graph.GetNodeById(nodeId);
if (pathNode == nullptr) {
failedNodeId = nodeId;
return false;
}
auto nodeType = pathNode->GetNodeType();
if (nodeType == SkNodeType::NODE_KERNEL || nodeType == startNodeType) {
cache[nodeId] = result;
}
}
return true;
}
const SuperKernelBaseNode* FindKernelNodeInDirection(uint64_t startNodeId, const SuperKernelGraph& graph,
SearchDirection direction,
std::unordered_map<uint64_t, const SuperKernelBaseNode*>& cache,
int maxHops = 100)
{
if (startNodeId == INVALID_TASK_ID) {
SK_LOGI("FindKernelNodeInDirection skipped: startNodeId is INVALID_TASK_ID, direction=%s",
to_string(direction));
return nullptr;
}
const auto* startNode = graph.GetNodeById(startNodeId);
if (startNode == nullptr) {
SK_LOGI("FindKernelNodeInDirection incomplete: startNodeId=%lu, direction=%s, reason=start-node-not-found",
startNodeId, to_string(direction));
return nullptr;
}
const SkNodeType startNodeType = startNode->GetNodeType();
uint64_t curNodeId = startNodeId;
std::vector<uint64_t> path;
for (int i = 0; i < maxHops; ++i) {
path.push_back(curNodeId);
const auto* current = graph.GetNodeById(curNodeId);
if (current == nullptr) {
LogKernelSearchFailure(startNodeId, direction, path, "node-not-found", curNodeId);
SK_LOGI("Node with ID %lu not found in graph.", curNodeId);
return nullptr;
}
const auto* result = ResolveCurrentKernel(curNodeId, current, cache);
if (result) {
uint64_t failedNodeId = INVALID_TASK_ID;
if (!CacheTraversedNodes(graph, path, startNodeType, result, cache, failedNodeId)) {
LogKernelSearchFailure(startNodeId, direction, path, "path-node-not-found-during-cache-fill",
failedNodeId);
return nullptr;
}
return result;
}
curNodeId = (direction == SearchDirection::PREV) ? current->GetPreNodeId() : current->GetNextNodeId();
if (curNodeId == INVALID_TASK_ID) {
const char* reason =
(direction == SearchDirection::PREV) ? "no-suitable-prev-node" : "no-suitable-next-node";
LogKernelSearchFailure(startNodeId, direction, path, reason, path.back());
SK_LOGI("nodeId:%lu has no %s-node.", startNodeId, to_string(direction));
return nullptr;
}
if (std::find(path.cbegin(), path.cend(), curNodeId) != path.cend()) {
LogKernelSearchFailure(startNodeId, direction, path, "loop-detected", curNodeId);
SK_LOGI("nodeId:%lu detected loop in %s direction.", startNodeId, to_string(direction));
return nullptr;
}
}
LogKernelSearchFailure(startNodeId, direction, path, "max-hops-exceeded", curNodeId);
SK_LOGI("nodeId:%lu search exceeded max hops (%d) in %s direction.", startNodeId, maxHops, to_string(direction));
return nullptr;
}
SyncDirection GenSyncDirection(SkQueueType preType, SkQueueType currType)
{
switch (preType) {
case SkQueueType::MIX_1_1:
case SkQueueType::MIX_1_2:
if (currType == SkQueueType::MIX_1_1 || currType == SkQueueType::MIX_1_2) {
return SyncDirection::MIX_TO_MIX;
}
return (currType == SkQueueType::AIC) ? SyncDirection::VEC_TO_CUB : SyncDirection::CUB_TO_VEC;
case SkQueueType::AIC:
switch (currType) {
case SkQueueType::MIX_1_1:
case SkQueueType::MIX_1_2:
return SyncDirection::CUB_TO_VEC;
case SkQueueType::AIC:
return SyncDirection::CUB_TO_CUB;
default:
return SyncDirection::CUB_TO_VEC;
}
case SkQueueType::AIV:
switch (currType) {
case SkQueueType::MIX_1_1:
case SkQueueType::MIX_1_2:
return SyncDirection::VEC_TO_CUB;
case SkQueueType::AIV:
return SyncDirection::VEC_TO_VEC;
default:
return SyncDirection::VEC_TO_CUB;
}
default:
return SyncDirection::VEC_TO_CUB;
}
}
std::pair<bool, bool> CheckResourceStatus(const std::vector<SuperKernelBaseNode*>& tasks)
{
bool aicAvailable = false;
bool aivAvailable = false;
for (const auto* task : tasks) {
if (task->GetNodeType() != SkNodeType::NODE_KERNEL) {
continue;
}
const KernelInfos& kernelInfo = GetKernelInfos(task);
aicAvailable |= UsesAic(ToQueueType(kernelInfo.kernelType));
aivAvailable |= UsesAiv(ToQueueType(kernelInfo.kernelType));
if (aicAvailable && aivAvailable) {
break;
}
}
return {aicAvailable, aivAvailable};
}
}
bool SkTaskBuilder::InitTaskSyncInfos(const std::vector<SuperKernelBaseNode*>& tasks)
{
taskSyncInfos_.clear();
taskSyncInfos_.resize(tasks.size());
std::tie(aicAvailable_, aivAvailable_) = CheckResourceStatus(tasks);
if (!aicAvailable_ && !aivAvailable_) {
SK_LOGE("no AIC or AIV resource detected in the tasks, unable to assign queue types.");
return false;
}
std::unordered_map<uint64_t, const SuperKernelBaseNode*> kernelNodeCache;
size_t kernelCount = 0;
size_t notifyCount = 0;
size_t waitCount = 0;
size_t resetCount = 0;
SkQueueType firstKernelEventQueueType = InferFirstKernelEventQueueType(tasks, aicAvailable_, aivAvailable_);
SuperKernelBaseNode* prevKernelTask = nullptr;
for (size_t i = 0; i < tasks.size(); i++) {
SkNodeType nodeType = tasks[i]->GetNodeType();
switch (nodeType) {
case SkNodeType::NODE_KERNEL: {
const KernelInfos& kernelInfo = GetKernelInfos(tasks[i]);
taskSyncInfos_[i].queueType = ToQueueType(kernelInfo.kernelType);
prevKernelTask = tasks[i];
kernelCount++;
break;
}
case SkNodeType::NODE_NOTIFY: {
auto* kernel =
FindKernelNodeInDirection(tasks[i]->GetPreNodeId(), graph_, SearchDirection::PREV, kernelNodeCache);
if (kernel == nullptr) {
if (firstKernelEventQueueType == SkQueueType::UNKNOWN) {
SK_LOGE(
"%s node %zu failed to resolve previous KERNEL node and fallback queue type is UNKNOWN, nodeId=%lu",
to_string(nodeType), i, tasks[i]->GetNodeId());
return false;
}
taskSyncInfos_[i].queueType = firstKernelEventQueueType;
SK_LOGI("%s node %zu unable to resolve previous KERNEL node, nodeId=%lu, fallback = %s",
to_string(nodeType), i, tasks[i]->GetNodeId(), to_string(taskSyncInfos_[i].queueType));
} else {
kernelNodeCache[tasks[i]->GetNodeId()] = kernel;
taskSyncInfos_[i].queueType =
ToEventQueueType(ToQueueType(GetKernelInfos(kernel).kernelType), aicAvailable_, aivAvailable_);
}
notifyCount++;
break;
}
case SkNodeType::NODE_RESET: {
auto* kernel =
FindKernelNodeInDirection(tasks[i]->GetPreNodeId(), graph_, SearchDirection::PREV, kernelNodeCache);
if (kernel == nullptr) {
if (firstKernelEventQueueType == SkQueueType::UNKNOWN) {
SK_LOGE(
"%s node %zu failed to resolve previous KERNEL node and fallback queue type is UNKNOWN, nodeId=%lu",
to_string(nodeType), i, tasks[i]->GetNodeId());
return false;
}
taskSyncInfos_[i].queueType = firstKernelEventQueueType;
SK_LOGI("%s node %zu unable to resolve previous KERNEL node, nodeId=%lu, fallback = %s",
to_string(nodeType), i, tasks[i]->GetNodeId(), to_string(taskSyncInfos_[i].queueType));
} else {
kernelNodeCache[tasks[i]->GetNodeId()] = kernel;
taskSyncInfos_[i].queueType =
ToEventQueueType(ToQueueType(GetKernelInfos(kernel).kernelType), aicAvailable_, aivAvailable_);
}
resetCount++;
break;
}
case SkNodeType::NODE_WAIT: {
auto* kernel = FindKernelNodeInDirection(tasks[i]->GetNextNodeId(), graph_, SearchDirection::NEXT,
kernelNodeCache);
if (kernel == nullptr) {
if (firstKernelEventQueueType == SkQueueType::UNKNOWN) {
SK_LOGE(
"%s node %zu failed to resolve next KERNEL node and fallback queue type is UNKNOWN, nodeId=%lu",
to_string(nodeType), i, tasks[i]->GetNodeId());
return false;
}
taskSyncInfos_[i].queueType = firstKernelEventQueueType;
SK_LOGI("%s node %zu unable to resolve next KERNEL node, nodeId=%lu, fallback = %s",
to_string(nodeType), i, tasks[i]->GetNodeId(), to_string(taskSyncInfos_[i].queueType));
} else {
kernelNodeCache[tasks[i]->GetNodeId()] = kernel;
SkQueueType nextKernelQueueType = ToQueueType(GetKernelInfos(kernel).kernelType);
if (nextKernelQueueType != SkQueueType::AIC && nextKernelQueueType != SkQueueType::AIV) {
taskSyncInfos_[i].queueType =
ToEventQueueType(ResolveMixWaitEventQueueType(prevKernelTask, kernel), aicAvailable_,
aivAvailable_);
} else {
taskSyncInfos_[i].queueType =
ToEventQueueType(nextKernelQueueType, aicAvailable_, aivAvailable_);
}
}
waitCount++;
break;
}
default:
SK_LOGE("unsupported node type for sync info initialization");
return false;
}
if (nodeType != SkNodeType::NODE_NOTIFY && nodeType != SkNodeType::NODE_RESET && i < tasks.size() - 1) {
switch (taskSyncInfos_[i].queueType) {
case SkQueueType::AIC:
taskSyncInfos_[i].crossSyncInfo[static_cast<size_t>(SkQueueType::AIC)] = SyncDirection::CUB_TO_CUB;
break;
case SkQueueType::AIV:
taskSyncInfos_[i].crossSyncInfo[static_cast<size_t>(SkQueueType::AIV)] = SyncDirection::VEC_TO_VEC;
break;
case SkQueueType::MIX_1_1:
case SkQueueType::MIX_1_2:
taskSyncInfos_[i].crossSyncInfo[static_cast<size_t>(SkQueueType::AIC)] = SyncDirection::CUB_TO_CUB;
taskSyncInfos_[i].crossSyncInfo[static_cast<size_t>(SkQueueType::AIV)] = SyncDirection::VEC_TO_VEC;
break;
default:
SK_LOGE("unsupported kernel type : %s for inter-sync.", to_string(taskSyncInfos_[i].queueType));
return false;
}
}
}
SK_LOGI("Initialized TaskSyncInfos for %zu tasks (%zu KERNEL, %zu NOTIFY, %zu WAIT, %zu RESET)", tasks.size(),
kernelCount, notifyCount, waitCount, resetCount);
return true;
}
void SkTaskBuilder::InsertSyncEvent(size_t preIdx, size_t currIdx)
{
SkQueueType preType = taskSyncInfos_[preIdx].queueType;
SkQueueType currType = taskSyncInfos_[currIdx].queueType;
SyncDirection dir = GenSyncDirection(preType, currType);
SK_LOGI(" InsertSyncEvent: task[%zu](%s)(nodeId=%lu) -> task[%zu](%s)(nodeId=%lu), dir=%s",
preIdx, to_string(preType), indexToNodeId_[preIdx],
currIdx, to_string(currType), indexToNodeId_[currIdx],
to_string(dir));
if (dir == SyncDirection::CUB_TO_VEC || dir == SyncDirection::MIX_TO_MIX) {
taskSyncInfos_[preIdx].cubSendInfo[currIdx] = SyncDirection::CUB_TO_VEC;
}
if (dir == SyncDirection::VEC_TO_CUB || dir == SyncDirection::MIX_TO_MIX) {
taskSyncInfos_[preIdx].vecSendInfo[currIdx] = SyncDirection::VEC_TO_CUB;
}
if (dir == SyncDirection::CUB_TO_VEC || dir == SyncDirection::MIX_TO_MIX) {
taskSyncInfos_[currIdx].vecRecvInfo[preIdx] = SyncDirection::CUB_TO_VEC;
}
if (dir == SyncDirection::VEC_TO_CUB || dir == SyncDirection::MIX_TO_MIX) {
taskSyncInfos_[currIdx].cubRecvInfo[preIdx] = SyncDirection::VEC_TO_CUB;
}
}
void SkTaskBuilder::PrintSyncInfo(const char* stage)
{
SK_LOGI("%s", stage);
SK_LOGI("[VEC LIST OP]:");
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (UsesAiv(info.queueType)) {
std::string vecSendStr = "vecSend={";
for (auto& kv : info.vecSendInfo) {
vecSendStr += std::to_string(kv.first) + ":" + to_string(kv.second) + " ";
}
vecSendStr += "}, vecRecv={";
for (auto& kv : info.vecRecvInfo) {
vecSendStr += std::to_string(kv.first) + ":" + to_string(kv.second) + " ";
}
vecSendStr += "}";
SK_LOGI(" task[%zu](%s): %s", i, to_string(info.queueType), vecSendStr.c_str());
}
}
SK_LOGI("[CUB LIST OP]:");
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (UsesAic(info.queueType)) {
std::string cubSendStr = "cubSend={";
for (auto& kv : info.cubSendInfo) {
cubSendStr += std::to_string(kv.first) + ":" + to_string(kv.second) + " ";
}
cubSendStr += "}, cubRecv={";
for (auto& kv : info.cubRecvInfo) {
cubSendStr += std::to_string(kv.first) + ":" + to_string(kv.second) + " ";
}
cubSendStr += "}";
SK_LOGI(" task[%zu](%s): %s", i, to_string(info.queueType), cubSendStr.c_str());
}
}
}
bool SkTaskBuilder::PrecomputeSyncRelationsFromGraph(const std::vector<SuperKernelBaseNode*>& tasks)
{
SK_LOGI("Precomputing sync relations from graph, taskCount=%zu", tasks.size());
if (!InitTaskSyncInfos(tasks)) {
SK_LOGE("PrecomputeSyncRelationsFromGraph failed: InitTaskSyncInfos failed");
return false;
}
SK_LOGI("[Sync by stream idx]");
ExtractIntraStreamSync(tasks);
SK_LOGI("[Sync by event]");
bool flag = ExtractInterStreamSync(tasks);
return flag;
}
bool SkTaskBuilder::IsMixKernelTask(const SuperKernelBaseNode* task) const
{
if (task == nullptr) {
return false;
}
if (task->GetNodeType() != SkNodeType::NODE_KERNEL) {
return false;
}
const KernelInfos& kernelInfo = GetKernelInfos(task);
return kernelInfo.kernelType == SkKernelType::MIX_AIC_1_1 ||
kernelInfo.kernelType == SkKernelType::MIX_AIC_1_2;
}
bool SkTaskBuilder::SplitTasksByMixGroups(const std::vector<SuperKernelBaseNode*>& tasks,
std::vector<std::vector<SuperKernelBaseNode*>>& splitTasks,
bool& hasMixKernel) const
{
splitTasks.clear();
std::vector<SuperKernelBaseNode*> subTasks;
bool inMixGroup = false;
hasMixKernel = false;
for (auto* task : tasks) {
if (task == nullptr) {
SK_LOGE("SplitTasksByMixGroups failed: null task found");
return false;
}
const bool curIsMix = IsMixKernelTask(task);
hasMixKernel = hasMixKernel || curIsMix;
if (!subTasks.empty() && curIsMix != inMixGroup) {
splitTasks.push_back(std::move(subTasks));
subTasks.clear();
}
subTasks.push_back(task);
inMixGroup = curIsMix;
}
if (!subTasks.empty()) {
splitTasks.push_back(std::move(subTasks));
}
return true;
}
bool SkTaskBuilder::InitSyncInfoSnapshotForMixGroups(const std::vector<SuperKernelBaseNode*>& tasks,
std::vector<TaskSyncInfo>& taskSyncInfosOrigin)
{
if (!InitTaskSyncInfos(tasks)) {
SK_LOGE("InitSyncInfoSnapshotForMixGroups failed: InitTaskSyncInfos failed");
return false;
}
taskSyncInfosOrigin = taskSyncInfos_;
taskSyncInfos_.clear();
if (taskSyncInfosOrigin.size() != tasks.size()) {
SK_LOGE("InitSyncInfoSnapshotForMixGroups failed: init sync info size mismatch, origin=%zu, expected=%zu",
taskSyncInfosOrigin.size(), tasks.size());
return false;
}
return true;
}
bool SkTaskBuilder::RebaseTaskSyncInfo(TaskSyncInfo& syncInfo, size_t offset) const
{
auto rebaseSyncInfo = [](std::map<size_t, SyncDirection>& syncInfoMap, size_t offsetValue) -> bool {
if (offsetValue == 0 || syncInfoMap.empty()) {
return true;
}
std::map<size_t, SyncDirection> rebasedSyncInfo;
for (const auto& syncPair : syncInfoMap) {
if (syncPair.first > std::numeric_limits<size_t>::max() - offsetValue) {
SK_LOGE("RebaseTaskSyncInfo failed: sync index overflow, index=%zu, offset=%zu",
syncPair.first, offsetValue);
return false;
}
rebasedSyncInfo[syncPair.first + offsetValue] = syncPair.second;
}
syncInfoMap.swap(rebasedSyncInfo);
return true;
};
return rebaseSyncInfo(syncInfo.vecRecvInfo, offset) &&
rebaseSyncInfo(syncInfo.cubRecvInfo, offset) &&
rebaseSyncInfo(syncInfo.vecSendInfo, offset) &&
rebaseSyncInfo(syncInfo.cubSendInfo, offset);
}
void SkTaskBuilder::AddBoundaryAllSync(const std::vector<SuperKernelBaseNode*>& curSplitTasks,
size_t groupIndex,
size_t groupOffset)
{
auto& boundarySyncInfo = taskSyncInfos_.back();
SK_LOGI("AddBoundaryAllSync: add boundary all-sync at group=%zu, localTask=%zu, "
"globalTask=%zu, nodeId=%lu",
groupIndex, curSplitTasks.size() - 1, groupOffset + curSplitTasks.size() - 1,
curSplitTasks.back()->GetNodeId());
boundarySyncInfo.vecSendInfo.clear();
boundarySyncInfo.cubSendInfo.clear();
boundarySyncInfo.crossSyncInfo.clear();
boundarySyncInfo.crossSyncInfo[static_cast<size_t>(SkQueueType::AIC)] = SyncDirection::ALL_SYNC;
}
bool SkTaskBuilder::ProcessSyncRelationSplitGroup(const std::vector<SuperKernelBaseNode*>& curSplitTasks,
size_t groupIndex,
size_t groupOffset,
bool hasNextGroup,
const std::vector<TaskSyncInfo>& taskSyncInfosOrigin,
std::vector<TaskSyncInfo>& mergedTaskSyncInfos)
{
if (curSplitTasks.empty()) {
SK_LOGE("ProcessSyncRelationSplitGroup failed: empty split group, index=%zu", groupIndex);
return false;
}
if (groupOffset > taskSyncInfosOrigin.size() ||
curSplitTasks.size() > taskSyncInfosOrigin.size() - groupOffset) {
SK_LOGE("ProcessSyncRelationSplitGroup failed: split range out of bounds, index=%zu, offset=%zu, "
"groupSize=%zu, originSize=%zu",
groupIndex, groupOffset, curSplitTasks.size(), taskSyncInfosOrigin.size());
return false;
}
const bool groupIsMix = IsMixKernelTask(curSplitTasks.front());
SK_LOGI("ProcessSyncRelationSplitGroup: process group index=%zu, offset=%zu, size=%zu, isMixGroup=%d, "
"firstNodeId=%lu, lastNodeId=%lu",
groupIndex, groupOffset, curSplitTasks.size(), static_cast<int>(groupIsMix),
curSplitTasks.front()->GetNodeId(), curSplitTasks.back()->GetNodeId());
taskSyncInfos_.clear();
nodeIdToIndex_.clear();
indexToNodeId_.clear();
taskSyncInfos_.resize(curSplitTasks.size());
for (size_t j = 0; j < curSplitTasks.size(); j++) {
taskSyncInfos_[j] = taskSyncInfosOrigin[groupOffset + j];
}
SK_LOGI("Build sub-phase-1 begin: precompute sync relations, taskCount=%zu, index=%zu",
curSplitTasks.size(), groupIndex);
SK_LOGI("[sub sync by stream idx]");
ExtractIntraStreamSync(curSplitTasks);
SK_LOGI("[sub sync by event]");
if (!ExtractInterStreamSync(curSplitTasks)) {
SK_LOGE("ProcessSyncRelationSplitGroup failed: ExtractInterStreamSync failed");
return false;
}
SK_LOGI("Build sub-phase-2 begin: optimize sync relations, index=%zu", groupIndex);
OptimizeSyncRelations(curSplitTasks);
if (hasNextGroup) {
AddBoundaryAllSync(curSplitTasks, groupIndex, groupOffset);
}
for (auto& syncInfo : taskSyncInfos_) {
if (!RebaseTaskSyncInfo(syncInfo, groupOffset)) {
return false;
}
mergedTaskSyncInfos.push_back(std::move(syncInfo));
}
SK_LOGI("ProcessSyncRelationSplitGroup: finish group index=%zu, mergedTaskCount=%zu",
groupIndex, mergedTaskSyncInfos.size());
return true;
}
bool SkTaskBuilder::PrecomputeSyncRelationsByMixGroups(const std::vector<SuperKernelBaseNode*>& tasks)
{
if (tasks.empty()) {
SK_LOGE("PrecomputeSyncRelationsByMixGroups failed: empty tasks");
return false;
}
std::vector<std::vector<SuperKernelBaseNode*>> splitTasks;
bool hasMixKernel = false;
if (!SplitTasksByMixGroups(tasks, splitTasks, hasMixKernel)) {
return false;
}
SK_LOGI("PrecomputeSyncRelationsByMixGroups: taskCount=%zu, splitGroupCount=%zu, hasMixKernel=%d",
tasks.size(), splitTasks.size(), static_cast<int>(hasMixKernel));
std::vector<TaskSyncInfo> taskSyncInfosOrigin;
if (!InitSyncInfoSnapshotForMixGroups(tasks, taskSyncInfosOrigin)) {
return false;
}
std::vector<TaskSyncInfo> mergedTaskSyncInfos;
mergedTaskSyncInfos.reserve(tasks.size());
size_t curTaskCount = 0;
for (size_t i = 0; i < splitTasks.size(); i++) {
if (!ProcessSyncRelationSplitGroup(splitTasks[i], i, curTaskCount, i + 1 < splitTasks.size(),
taskSyncInfosOrigin, mergedTaskSyncInfos)) {
return false;
}
curTaskCount += splitTasks[i].size();
}
if (curTaskCount != tasks.size() || mergedTaskSyncInfos.size() != tasks.size()) {
SK_LOGE("PrecomputeSyncRelationsByMixGroups failed: merged task sync info size mismatch, consumed=%zu, "
"merged=%zu, expected=%zu",
curTaskCount, mergedTaskSyncInfos.size(), tasks.size());
return false;
}
taskSyncInfos_.swap(mergedTaskSyncInfos);
nodeIdToIndex_.clear();
indexToNodeId_.clear();
SK_LOGI("PrecomputeSyncRelationsByMixGroups complete: taskCount=%zu, splitGroupCount=%zu",
tasks.size(), splitTasks.size());
return true;
}
void SkTaskBuilder::ExtractIntraStreamSync(const std::vector<SuperKernelBaseNode*>& tasks)
{
SK_LOGI("ExtractIntraStreamSync: processing %zu tasks", tasks.size());
std::map<uint32_t, std::vector<size_t>> streamOps;
for (size_t i = 0; i < tasks.size(); i++) {
uint32_t streamIdx = tasks[i]->GetStreamIdxInGraph();
streamOps[streamIdx].push_back(i);
nodeIdToIndex_[tasks[i]->GetNodeId()] = i;
indexToNodeId_[i] = tasks[i]->GetNodeId();
}
auto streamfusionOption = opts.GetOption(aclskOptionType::STREAM_FUSION);
uint32_t streamFusionValue = 1;
if (streamfusionOption != nullptr) {
streamFusionValue = streamfusionOption->GetIntValue();
}
if (streamFusionValue == 0 && streamOps.size() > 1) {
SK_LOGW("Multi stream fusion is triggered with %zu streams detected, "
"but aclskStreamFusionOption is off (value=%u). "
"To explicitly enable multi stream fusion, set aclskStreamFusionOption to 1. "
"Please confirm whether this fusion behavior meets your expectations.",
streamOps.size(), streamFusionValue);
}
size_t totalInsertedSyncEdges = 0;
for (auto& streamPair : streamOps) {
SK_LOGI("Process stream sync: streamIdx=%u, opCount=%zu", streamPair.first, streamPair.second.size());
auto& opList = streamPair.second;
for (size_t j = 1; j < opList.size(); j++) {
size_t preIdx = opList[j - 1];
size_t currIdx = opList[j];
InsertSyncEvent(preIdx, currIdx);
++totalInsertedSyncEdges;
}
}
SK_LOGI("ExtractIntraStreamSync complete: streamCount=%zu, insertedSyncEdges=%zu", streamOps.size(),
totalInsertedSyncEdges);
}
* @brief Extract inter-stream synchronization relationships based on task dependencies
*
* This function identifies synchronization requirements between different execution streams
* by analyzing task dependencies. It inserts sync events (NOTIFY/WAIT pairs) to establish
* proper synchronization boundaries when tasks from different streams need to coordinate.
*
* @param tasks Vector of super kernel base nodes to analyze
*
* @note This function works in two phases:
* 1. Build a task ID set for quick lookup
* 2. For each KERNEL node, check its successors and insert sync events
* when the successor is also in the current task set
*/
bool SkTaskBuilder::ExtractInterStreamSync(const std::vector<SuperKernelBaseNode*>& tasks)
{
SK_LOGI("ExtractInterStreamSync: starting to extract inter-stream synchronization relationships");
SK_LOGI("ExtractInterStreamSync: total number of tasks to process = %zu", tasks.size());
std::unordered_set<uint64_t> taskIds;
for (size_t i = 0; i < tasks.size(); i++) {
SuperKernelBaseNode* node = tasks[i];
taskIds.insert(node->GetNodeId());
}
SK_LOGI("ExtractInterStreamSync: built task ID set with %zu unique IDs", taskIds.size());
uint32_t syncEventCount = 0;
for (size_t i = 0; i < tasks.size(); i++) {
SuperKernelBaseNode* node = tasks[i];
auto preNodeId = node->GetNodeId();
SkNodeType nodeType = node->GetNodeType();
if (nodeType == SkNodeType::NODE_KERNEL) {
SK_LOGI("ExtractInterStreamSync: processing KERNEL node %lu with %zu successors",
preNodeId, node->sendToNodeId.size());
for (auto nextId : node->sendToNodeId) {
if (taskIds.find(nextId) != taskIds.end()) {
SK_LOGI("ExtractInterStreamSync: inserting sync event between %lu -> %lu",
preNodeId, nextId);
if (nodeIdToIndex_.find(preNodeId) == nodeIdToIndex_.end() || nodeIdToIndex_.find(nextId) == nodeIdToIndex_.end()) {
SK_LOGE("NodeId not exists in task nodes, node id is %lu, %lu", preNodeId, nextId);
return false;
}
InsertSyncEvent(nodeIdToIndex_[preNodeId], nodeIdToIndex_[nextId]);
syncEventCount++;
} else {
SK_LOGI("ExtractInterStreamSync: skipping external successor %lu (not in task set)",
nextId);
}
}
}
}
SK_LOGI("ExtractInterStreamSync: completed, inserted %u sync events", syncEventCount);
return true;
}
void SkTaskBuilder::OptimizeSyncRelations(const std::vector<SuperKernelBaseNode*>& tasks)
{
SK_LOGI("Optimizing Sync Relations");
RemoveCrossedLineSync();
RemoveMultiSendSync();
RemoveMultiRecvSync();
RemoveRedundantCrossSync(tasks);
}
bool SkTaskBuilder::JudgeRemoveCrossSync(size_t sendIdx, size_t recvIdx, bool isCubToVec)
{
if (isCubToVec) {
for (size_t otherRecvIdx = 0; otherRecvIdx < recvIdx; otherRecvIdx++) {
for (auto& kv : taskSyncInfos_[otherRecvIdx].vecRecvInfo) {
size_t otherSendIdx = kv.first;
SyncDirection dir = kv.second;
if (dir == SyncDirection::CUB_TO_VEC || dir == SyncDirection::MIX_TO_MIX) {
if (otherSendIdx > sendIdx) {
SK_LOGI(" Found crossed sync: task[%zu]->task[%zu] crosses with task[%zu]->task[%zu]",
otherSendIdx, otherRecvIdx, sendIdx, recvIdx);
return true;
}
}
}
}
} else {
for (size_t otherRecvIdx = 0; otherRecvIdx < recvIdx; otherRecvIdx++) {
for (auto& kv : taskSyncInfos_[otherRecvIdx].cubRecvInfo) {
size_t otherSendIdx = kv.first;
SyncDirection dir = kv.second;
if (dir == SyncDirection::VEC_TO_CUB || dir == SyncDirection::MIX_TO_MIX) {
if (otherSendIdx > sendIdx) {
SK_LOGI(" Found crossed sync: task[%zu]->task[%zu] crosses with task[%zu]->task[%zu]",
otherSendIdx, otherRecvIdx, sendIdx, recvIdx);
return true;
}
}
}
}
}
return false;
}
void SkTaskBuilder::RemoveSyncInfo(size_t sendIdx, size_t recvIdx, bool isRemoveRecv, SyncDirection dirToRemove)
{
auto removeDirection = [](std::map<size_t, SyncDirection>& info, size_t key, SyncDirection toRemove) {
auto it = info.find(key);
if (it != info.end() && it->second == toRemove) {
info.erase(it);
}
};
if (isRemoveRecv) {
removeDirection(taskSyncInfos_[sendIdx].vecRecvInfo, recvIdx, dirToRemove);
removeDirection(taskSyncInfos_[sendIdx].cubRecvInfo, recvIdx, dirToRemove);
} else {
removeDirection(taskSyncInfos_[sendIdx].vecSendInfo, recvIdx, dirToRemove);
removeDirection(taskSyncInfos_[sendIdx].cubSendInfo, recvIdx, dirToRemove);
}
}
void SkTaskBuilder::RemoveCrossedLineSync()
{
SK_LOGI("RemoveCrossedLineSync: checking crossed sync relations");
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
std::vector<size_t> toRemove;
for (auto& kv : info.cubSendInfo) {
if (kv.second == SyncDirection::CUB_TO_VEC && JudgeRemoveCrossSync(i, kv.first, true)) {
toRemove.push_back(kv.first);
}
}
for (size_t recvIdx : toRemove) {
SK_LOGI(" Remove crossed cub:vec sync: task[%zu] -> task[%zu]", i, recvIdx);
RemoveSyncInfo(i, recvIdx, false, SyncDirection::CUB_TO_VEC);
RemoveSyncInfo(recvIdx, i, true, SyncDirection::CUB_TO_VEC);
}
}
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
std::vector<size_t> toRemove;
for (auto& kv : info.vecSendInfo) {
if (kv.second == SyncDirection::VEC_TO_CUB && JudgeRemoveCrossSync(i, kv.first, false)) {
toRemove.push_back(kv.first);
}
}
for (size_t recvIdx : toRemove) {
SK_LOGI(" Remove crossed vec:cub sync: task[%zu] -> task[%zu]", i, recvIdx);
RemoveSyncInfo(i, recvIdx, false, SyncDirection::VEC_TO_CUB);
RemoveSyncInfo(recvIdx, i, true, SyncDirection::VEC_TO_CUB);
}
}
}
void SkTaskBuilder::RemoveMultiSendSync()
{
SK_LOGI("RemoveMultiSendSync: removing redundant send sync");
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (info.vecSendInfo.size() > 1) {
std::vector<size_t> vecSendList;
for (auto& kv : info.vecSendInfo) {
if (kv.second == SyncDirection::VEC_TO_CUB) {
vecSendList.push_back(kv.first);
}
}
if (vecSendList.size() > 1) {
std::sort(vecSendList.begin(), vecSendList.end());
for (size_t j = 1; j < vecSendList.size(); j++) {
size_t recvIdx = vecSendList[j];
SK_LOGI(" Remove multi vec:cub send sync: task[%zu] -> task[%zu]", i, recvIdx);
RemoveSyncInfo(i, recvIdx, false, SyncDirection::VEC_TO_CUB);
RemoveSyncInfo(recvIdx, i, true, SyncDirection::VEC_TO_CUB);
}
}
}
}
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (info.cubSendInfo.size() > 1) {
std::vector<size_t> cubSendList;
for (auto& kv : info.cubSendInfo) {
if (kv.second == SyncDirection::CUB_TO_VEC) {
cubSendList.push_back(kv.first);
}
}
if (cubSendList.size() > 1) {
std::sort(cubSendList.begin(), cubSendList.end());
for (size_t j = 1; j < cubSendList.size(); j++) {
size_t recvIdx = cubSendList[j];
SK_LOGI(" Remove multi cub:vec send sync: task[%zu] -> task[%zu]", i, recvIdx);
RemoveSyncInfo(i, recvIdx, false, SyncDirection::CUB_TO_VEC);
RemoveSyncInfo(recvIdx, i, true, SyncDirection::CUB_TO_VEC);
}
}
}
}
}
void SkTaskBuilder::RemoveMultiRecvSync()
{
SK_LOGI("RemoveMultiRecvSync: removing redundant recv sync");
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (info.vecRecvInfo.size() > 1) {
std::vector<size_t> vecRecvList;
for (auto& kv : info.vecRecvInfo) {
if (kv.second == SyncDirection::CUB_TO_VEC) {
vecRecvList.push_back(kv.first);
}
}
if (vecRecvList.size() > 1) {
std::sort(vecRecvList.begin(), vecRecvList.end(), std::greater<>());
for (size_t j = 1; j < vecRecvList.size(); j++) {
size_t sendIdx = vecRecvList[j];
SK_LOGI(" Remove multi cub:vec recv sync: task[%zu] <- task[%zu]", i, sendIdx);
RemoveSyncInfo(i, sendIdx, true, SyncDirection::CUB_TO_VEC);
RemoveSyncInfo(sendIdx, i, false, SyncDirection::CUB_TO_VEC);
}
}
}
}
for (size_t i = 0; i < taskSyncInfos_.size(); i++) {
auto& info = taskSyncInfos_[i];
if (info.cubRecvInfo.size() > 1) {
std::vector<size_t> cubRecvList;
for (auto& kv : info.cubRecvInfo) {
if (kv.second == SyncDirection::VEC_TO_CUB) {
cubRecvList.push_back(kv.first);
}
}
if (cubRecvList.size() > 1) {
std::sort(cubRecvList.begin(), cubRecvList.end(), std::greater<>());
for (size_t j = 1; j < cubRecvList.size(); j++) {
size_t sendIdx = cubRecvList[j];
SK_LOGI(" Remove multi vec:cub recv sync: task[%zu] <- task[%zu]", i, sendIdx);
RemoveSyncInfo(i, sendIdx, true, SyncDirection::VEC_TO_CUB);
RemoveSyncInfo(sendIdx, i, false, SyncDirection::VEC_TO_CUB);
}
}
}
}
}
namespace {
size_t PickCandidateKernelIdx(size_t lastAicKernelIdx, size_t lastAivKernelIdx,
SkQueueType queueType, size_t invalidIdx)
{
size_t candidateIdx = invalidIdx;
if (queueType == SkQueueType::AIC) {
candidateIdx = lastAicKernelIdx;
} else if (queueType == SkQueueType::AIV) {
candidateIdx = lastAivKernelIdx;
} else {
SK_LOGE("Unexpected queue type %s for WAIT node, cannot determine candidate kernel for redundant sync removal.",
to_string(queueType));
return invalidIdx;
}
return candidateIdx;
}
}
void SkTaskBuilder::RemoveRedundantCrossSync(const std::vector<SuperKernelBaseNode*>& tasks)
{
const size_t invalidIdx = static_cast<size_t>(INVALID_TASK_ID);
size_t lastAicKernelIdx = invalidIdx;
size_t lastAivKernelIdx = invalidIdx;
for (size_t i = 0; i < tasks.size(); ++i) {
const auto nodeType = tasks[i]->GetNodeType();
auto& info = taskSyncInfos_[i];
if (nodeType == SkNodeType::NODE_KERNEL) {
if (UsesAic(info.queueType)) {
lastAicKernelIdx = i;
}
if (UsesAiv(info.queueType)) {
lastAivKernelIdx = i;
}
continue;
}
if (nodeType != SkNodeType::NODE_WAIT) {
continue;
}
const size_t candidateKernelIdx =
PickCandidateKernelIdx(lastAicKernelIdx, lastAivKernelIdx, info.queueType, invalidIdx);
if (candidateKernelIdx == invalidIdx) {
continue;
}
auto& candidateInfo = taskSyncInfos_[candidateKernelIdx];
if (info.queueType == SkQueueType::AIC && candidateInfo.cubSendInfo.empty()) {
candidateInfo.crossSyncInfo.erase(static_cast<size_t>(SkQueueType::AIC));
SK_LOGI("Remove redundant cross sync: task[%zu] -> task[%zu], remove CUB_TO_CUB hint",
candidateKernelIdx, i);
} else if (info.queueType == SkQueueType::AIV && candidateInfo.vecSendInfo.empty()) {
candidateInfo.crossSyncInfo.erase(static_cast<size_t>(SkQueueType::AIV));
SK_LOGI("Remove redundant cross sync: task[%zu] -> task[%zu], remove VEC_TO_VEC hint",
candidateKernelIdx, i);
} else {
SK_LOGI("No redundant cross sync to remove for task[%zu] -> task[%zu]", candidateKernelIdx, i);
}
}
}
namespace {
SkCoreSyncType GetSyncTypesForDirection(SyncDirection dir, bool isSend)
{
SkCoreSyncType syncType = SkCoreSyncType::SYNC_NONE;
switch (dir) {
case SyncDirection::CUB_TO_VEC:
syncType = isSend ? SkCoreSyncType::INTER_SYNC_SET_AIC_TO_AIV : SkCoreSyncType::INTER_SYNC_WAIT_AIC_TO_AIV;
break;
case SyncDirection::VEC_TO_CUB:
syncType = isSend ? SkCoreSyncType::INTER_SYNC_SET_AIV_TO_AIC : SkCoreSyncType::INTER_SYNC_WAIT_AIV_TO_AIC;
break;
case SyncDirection::CUB_TO_CUB:
syncType = SkCoreSyncType::CROSS_SYNC_AIC_TO_AIC;
break;
case SyncDirection::VEC_TO_VEC:
syncType = SkCoreSyncType::CROSS_SYNC_AIV_TO_AIV;
break;
case SyncDirection::MIX_TO_MIX:
syncType = SkCoreSyncType::SYNC_NONE;
SK_LOGI("GetSyncTypesForDirection: unexpected MIX_TO_MIX direction, add syncType %s.", to_string(syncType));
break;
case SyncDirection::NONE:
syncType = SkCoreSyncType::SYNC_NONE;
break;
case SyncDirection::ALL_SYNC:
syncType = SkCoreSyncType::ALL_SYNC;
break;
default:
syncType = SkCoreSyncType::SYNC_NONE;
SK_LOGI("GetSyncTypesForDirection: unknown direction %s, add syncType %s.", to_string(dir), to_string(syncType));
break;
}
return syncType;
}
}
DeviceArgsPtr SkTaskBuilder::GenEntryArgs(const SkTask& skTaskCube, const SkTask& skTaskVec, const SkDfxInfo* dfxInfos,
uint32_t dfxCount, const SkEventConfig *eventConfig)
{
size_t header_size = sizeof(SkHeaderInfo);
size_t aic_que_size = skTaskCube.GetTaskQueSize();
size_t aiv_que_size = skTaskVec.GetTaskQueSize();
size_t counter_size = DEFAULT_COUNTER_COUNT * sizeof(SkCounterInfo);
size_t dfx_size = dfxCount * sizeof(SkDfxInfo);
size_t event_config_size = sizeof(SkEventConfig);
size_t aic_que_offset = header_size;
size_t aiv_que_offset = aic_que_offset + aic_que_size * 4;
size_t counter_offset = AlignUp(aiv_que_offset + aiv_que_size * 4, kCounterAlignBytes);
size_t dfx_offset = counter_offset + counter_size;
size_t event_config_offset = dfx_offset + dfx_size;
uint64_t total_size = event_config_offset + event_config_size;
SK_LOGI(
"sk total args size: %lu, header_size: %zu, aic_que_size: %zu, aiv_que_size: %zu, "
"counter_size: %zu, dfx_size: %zu, counter_offset: %zu",
total_size, header_size, aic_que_size, aiv_que_size, counter_size, dfx_size, counter_offset);
DeviceArgsPtr args;
if (!args.Init(total_size)) {
SK_LOGE("GenEntryArgs init device args failed, total_size=%lu", total_size);
return {};
}
args.Get()->skHeader.aicQueSize = aic_que_size;
args.Get()->skHeader.aivQueSize = aiv_que_size;
args.Get()->skHeader.aicQueOffset = aic_que_offset;
args.Get()->skHeader.aivQueOffset = aiv_que_offset;
args.Get()->skHeader.counterOffset = counter_offset;
args.Get()->skHeader.dfxOffset = dfx_offset;
args.Get()->skHeader.eventConfigOffset = event_config_offset;
args.Get()->skHeader.totalSize = total_size;
uint8_t* base = (uint8_t*)args.Get();
errno_t err = 0;
for (size_t i = 0; i < 4; i++) {
err = memcpy_s(base + aic_que_offset + i * aic_que_size,
aic_que_size, skTaskCube.GetTaskQue(), aic_que_size);
if (err != 0) {
SK_LOGE("GenEntryArgs memcpy_s AIC queue%zu failed, ret=%d", i + 1, err);
return {};
}
}
for (size_t i = 0; i < 4; i++) {
err = memcpy_s(base + aiv_que_offset + i * aiv_que_size,
aiv_que_size, skTaskVec.GetTaskQue(), aiv_que_size);
if (err != 0) {
SK_LOGE("GenEntryArgs memcpy_s AIV queue%zu failed, ret=%d", i + 1, err);
return {};
}
}
if (counter_size > 0) {
err = memset_s(base + counter_offset, counter_size, 0, counter_size);
if (err != 0) {
SK_LOGE("GenEntryArgs memset_s counter failed, ret=%d", static_cast<int>(err));
return {};
}
}
if (dfx_size > 0 && dfxInfos != nullptr) {
err = memcpy_s((uint8_t*)args.Get() + args.Get()->skHeader.dfxOffset, dfx_size, dfxInfos,
dfxCount * sizeof(SkDfxInfo));
if (err != 0) {
SK_LOGE("GenEntryArgs memcpy_s dfx failed, ret=%d", static_cast<int>(err));
return {};
}
}
SkEventConfig *dstEventConfig = (SkEventConfig *)(base + event_config_offset);
if (eventConfig != nullptr) {
err = memcpy_s(dstEventConfig, event_config_size, eventConfig, event_config_size);
if (err != 0) {
SK_LOGE("GenEntryArgs memcpy_s eventConfig failed, ret=%d\n", static_cast<int>(err));
return DeviceArgsPtr();
}
} else {
memset_s(dstEventConfig, event_config_size, 0, event_config_size);
dstEventConfig->enabled = 0;
}
args.Get()->skHeader.nodeCnt = dfxCount;
return args;
}
std::pair<int, int> SkTaskBuilder::GetPreFetchCnt(const ResolvedFunctionInfo& resolved)
{
std::pair<int, int> preFetchCntValue = std::make_pair(resolved.prefetchCnt[0], resolved.prefetchCnt[1]);
auto preLoadOptions = opts.GetOption(aclskOptionType::PRELOAD_CODE);
uint32_t preLoadValue = 1;
if (preLoadOptions != nullptr) {
preLoadValue = preLoadOptions->GetIntValue();
}
if (preLoadValue == 0) {
preFetchCntValue.first = 16;
preFetchCntValue.second = 8;
} else if (preLoadValue == 2) {
preFetchCntValue.first = 0;
preFetchCntValue.second = 0;
}
return preFetchCntValue;
}
bool SkTaskBuilder::AddSyncTask(SkTask& skTask, size_t nodeIndex, SkCoreSyncType syncType)
{
TaskQue* taskQue = skTask.GetTaskQue();
if (taskQue == nullptr) {
SK_LOGE("AddSyncTask failed: get writable task queue failed");
return false;
}
auto syncAllOptions = opts.GetOption(aclskOptionType::DEBUG_SYNC_ALL);
uint32_t debugSyncAll = 0;
if (syncAllOptions != nullptr) {
debugSyncAll = syncAllOptions->GetIntValue();
}
TaskInfo& taskInfo = taskQue->taskInfos[taskQue->taskCnt];
taskInfo.index = nodeIndex;
taskInfo.type = SkTaskType::TYPE_SYNC;
taskInfo.args = static_cast<uint32_t>(syncType);
if (opts.EnableDebug() && debugSyncAll == 1) {
taskInfo.debugOptions |= 0x2;
}
taskQue->taskCnt++;
return true;
}
bool SkTaskBuilder::AddEventTask(SkTask& skTask, SuperKernelBaseNode* node, size_t nodeIndex, SkTaskType taskType)
{
TaskQue* taskQue = skTask.GetTaskQue();
if (taskQue == nullptr) {
SK_LOGE("AddEventTask failed: get writable task queue failed");
return false;
}
TaskInfo& taskInfo = taskQue->taskInfos[taskQue->taskCnt];
taskInfo.index = nodeIndex;
taskInfo.type = taskType;
const auto& syncInfos = node->GetNodeInfos().syncInfos;
uint64_t eventValue = syncInfos.memoryValue;
if (eventValue == std::numeric_limits<uint64_t>::max()) {
switch (taskType) {
case SkTaskType::TYPE_EVENT_NOTIFY:
eventValue = SK_DEFAULT_NOTIFY_VALUE;
break;
case SkTaskType::TYPE_EVENT_WAIT:
eventValue = SK_DEFAULT_WAIT_VALUE;
break;
case SkTaskType::TYPE_EVENT_RESET:
eventValue = SK_DEFAULT_RESET_VALUE;
break;
default:
eventValue = 0;
break;
}
}
uint32_t waitFlag = syncInfos.memoryWaitFlag;
if (taskType != SkTaskType::TYPE_EVENT_WAIT || waitFlag == std::numeric_limits<uint32_t>::max()) {
waitFlag = (taskType == SkTaskType::TYPE_EVENT_WAIT) ?
static_cast<uint32_t>(SkMemoryWaitFlag::EQ) : SK_DEFAULT_WRITE_FLAG;
}
SetEventTaskArgs(taskInfo,
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(syncInfos.addrValue)),
eventValue,
waitFlag);
taskQue->taskCnt++;
return true;
}
bool SkTaskBuilder::ProcessCoreFuncSize(SkDfxInfo* dfxInfo, const void* binHostAddr, uint32_t binHostSize,
const ResolvedFunctionInfo& resolved, int coreIndex, int binIndex,
const char* coreName)
{
SK_LOGD("ProcessCoreFuncSize: Processing %s (coreIndex=%d), funcAddr=0x%lx, funcOffset=0x%lx",
coreName, coreIndex, resolved.funcAddr[coreIndex], resolved.funcOffset[coreIndex]);
std::string symbolName;
uint64_t funcSize = 0;
std::string symbolBind;
bool getInfoRet = GetFuncSymbolInfo(static_cast<const char*>(binHostAddr), binHostSize,
resolved.funcOffset[coreIndex], symbolName, funcSize, symbolBind);
SK_LOGD("ProcessCoreFuncSize: GetFuncSymbolInfo(%s) returned=%d, offset=0x%lx, symbolName=%s, size=0x%lx, bind=%s",
coreName, getInfoRet, resolved.funcOffset[coreIndex], symbolName.c_str(), funcSize, symbolBind.c_str());
if (getInfoRet) {
if (coreIndex == 0) {
dfxInfo->aicSize = static_cast<uint32_t>(funcSize);
dfxInfo->entryAic[binIndex] = resolved.funcAddr[0];
SK_LOGD("ProcessCoreFuncSize: Set aicSize=0x%x, entryAic[%d]=0x%lx",
dfxInfo->aicSize, binIndex, dfxInfo->entryAic[binIndex]);
} else if (coreIndex == 1) {
dfxInfo->aivSize = static_cast<uint32_t>(funcSize);
dfxInfo->entryAiv[binIndex] = resolved.funcAddr[1];
SK_LOGD("ProcessCoreFuncSize: Set aivSize=0x%x, entryAiv[%d]=0x%lx",
dfxInfo->aivSize, binIndex, dfxInfo->entryAiv[binIndex]);
}
return true;
}
SK_LOGW("ProcessCoreFuncSize: GetFuncSymbolInfo failed for %s", coreName);
return false;
}
bool SkTaskBuilder::UpdateDfxInfo(SkDfxInfo* dfxInfo, const KernelInfos& kernelInfo,
const ResolvedFunctionInfo& resolved, int binIndex, int addrIndex)
{
if (dfxInfo == nullptr) {
return true;
}
SK_LOGD("UpdateDfxInfo: Processing dfxInfo for binIndex=%d, addrIndex=%d, funcName=%s",
binIndex, addrIndex, kernelInfo.funcName.c_str());
dfxInfo->binHdl = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(kernelInfo.binHdl));
dfxInfo->funcHdlOri = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(kernelInfo.funcHdl));
dfxInfo->numBlocks = kernelInfo.numBlocks;
dfxInfo->cubeNum = kernelInfo.cubeNum;
dfxInfo->vecNum = kernelInfo.vecNum;
SK_LOGD("UpdateDfxInfo: binHdl=0x%lx, funcHdlOri=0x%lx, numBlocks=%u, cubeNum=%u, vecNum=%u",
dfxInfo->binHdl, dfxInfo->funcHdlOri, dfxInfo->numBlocks, dfxInfo->cubeNum, dfxInfo->vecNum);
void *binHostAddr = nullptr;
uint32_t binHostSize = 0;
int rtRet = rtGetBinBuffer(kernelInfo.binHdl, RT_BIN_HOST_ADDR, &binHostAddr, &binHostSize);
if (rtRet != 0) {
SK_LOGW("UpdateDfxInfo: Failed to get bin buffer, rtRet=%d, using default size 0", rtRet);
return true;
}
SK_LOGD("UpdateDfxInfo: Successfully got bin buffer, binHostAddr=0x%lx, binHostSize=%u",
(uint64_t)binHostAddr, binHostSize);
if (addrIndex == 0 && resolved.funcAddr[0] != 0) {
ProcessCoreFuncSize(dfxInfo, binHostAddr, binHostSize, resolved, 0, binIndex, "AIC");
} else {
SK_LOGD("UpdateDfxInfo: Skipping AIC processing, addrIndex=%d, funcAddr[0]=0x%lx",
addrIndex, resolved.funcAddr[0]);
}
if (addrIndex == 1 && resolved.funcAddr[1] != 0) {
ProcessCoreFuncSize(dfxInfo, binHostAddr, binHostSize, resolved, 1, binIndex, "AIV");
} else {
SK_LOGD("UpdateDfxInfo: Skipping AIV processing, addrIndex=%d, funcAddr[1]=0x%lx",
addrIndex, resolved.funcAddr[1]);
}
return true;
}
bool SkTaskBuilder::AddFuncTask(SkTask& skTask, SuperKernelBaseNode* node, SkDfxInfo* dfxInfo, size_t nodeIndex,
int addrIndex, int binCount, SkTaskType taskType, uint32_t numBlocks)
{
TaskQue* taskQue = skTask.GetTaskQue();
if (taskQue == nullptr) {
SK_LOGE("AddFuncTask failed: get writable task queue failed");
return false;
}
auto disableDcciOptions = opts.GetOption(aclskOptionType::DCCI_DISABLE_ON_KERNEL);
auto dcciBeforeKernelStartOptions = opts.GetOption(aclskOptionType::DCCI_BEFORE_KERNEL_START);
auto dcciAfterKernelEndOptions = opts.GetOption(aclskOptionType::DCCI_AFTER_KERNEL_END);
auto crossCoreSyncCheckOptions = opts.GetOption(aclskOptionType::DEBUG_CROSS_CORE_SYNC_CHECK);
uint32_t debugCrossCoreSyncCheck = 0;
if (crossCoreSyncCheckOptions != nullptr) {
debugCrossCoreSyncCheck = crossCoreSyncCheckOptions->GetIntValue();
}
std::vector<std::string> disableDcciList;
std::vector<std::string> dcciBeforeKernelStartList;
std::vector<std::string> dcciAfterKernelEndList;
if (disableDcciOptions != nullptr) {
disableDcciList = disableDcciOptions->GetStringListValue();
}
if (dcciBeforeKernelStartOptions != nullptr) {
dcciBeforeKernelStartList = dcciBeforeKernelStartOptions->GetStringListValue();
}
if (dcciAfterKernelEndOptions != nullptr) {
dcciAfterKernelEndList = dcciAfterKernelEndOptions->GetStringListValue();
}
if (node->GetNodeType() != SkNodeType::NODE_KERNEL) {
SK_LOGE("AddFuncTask failed: unsupported node type for KERNEL task");
return false;
}
const KernelInfos& kernelInfo = node->GetNodeInfos().kernelInfos;
TaskInfo& taskInfo = taskQue->taskInfos[taskQue->taskCnt];
taskInfo.index = nodeIndex;
taskInfo.type = taskType;
taskInfo.originType = kernelInfo.kernelType;
taskInfo.numBlocks = numBlocks;
if (kernelInfo.resolvedNum != binCount) {
SK_LOGW("mismatch num between sub sk registered and sk option, funcName: %s, registered: %d,"
"option: %d", kernelInfo.funcName.c_str(), kernelInfo.resolvedNum, binCount);
}
for (int i = 0; i < binCount; i++) {
const ResolvedFunctionInfo& resolved = kernelInfo.resolvedFuncs[i];
if (taskType == SkTaskType::TYPE_PRELOAD) {
std::pair<int, int> prefetchCntValue = GetPreFetchCnt(resolved);
if (i == 0) {
SK_LOGI("kernel name: %s, prefetch count: %d %d", kernelInfo.funcName.c_str(), prefetchCntValue.first,
prefetchCntValue.second);
}
taskInfo.args = addrIndex == 0 ? prefetchCntValue.first : prefetchCntValue.second;
taskInfo.reserved = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(kernelInfo.devArgs));
}
uint64_t addr = resolved.funcAddr[addrIndex];
if (addr == 0) {
SK_LOGE("AddFuncTask failed: unresolved function address, nodeIndex=%zu, binIndex=%d, addrIndex=%d",
nodeIndex, i, addrIndex);
return false;
}
taskInfo.entryCnt++;
taskInfo.entry[i] = addr;
if (taskType == SkTaskType::TYPE_FUNC && dfxInfo) {
UpdateDfxInfo(dfxInfo, kernelInfo, resolved, i, addrIndex);
}
}
if (taskType == SkTaskType::TYPE_FUNC) {
taskInfo.args = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(kernelInfo.devArgs));
skTask.numBlocks = std::max(skTask.numBlocks, static_cast<uint32_t>(taskInfo.numBlocks));
skTask.funcCnt++;
if (!disableDcciList.empty() && !kernelInfo.funcName.empty()) {
if (opts.JudgeDisableKernelDcci(disableDcciList, kernelInfo.funcName)) {
taskInfo.debugOptions |= 0x1;
}
}
if (!dcciBeforeKernelStartList.empty() && !kernelInfo.funcName.empty()) {
if (opts.JudgeDisableKernelDcci(dcciBeforeKernelStartList, kernelInfo.funcName)) {
taskInfo.debugOptions |= 0x4;
}
}
if (!dcciAfterKernelEndList.empty() && !kernelInfo.funcName.empty()) {
if (opts.JudgeDisableKernelDcci(dcciAfterKernelEndList, kernelInfo.funcName)) {
taskInfo.debugOptions |= 0x8;
}
}
if (debugCrossCoreSyncCheck == 1) {
taskInfo.debugOptions |= 0x10;
}
}
taskQue->taskCnt++;
return true;
}
bool SkTaskBuilder::DispatchFuncTask(SkTask& skTaskCube, SkTask& skTaskVec, SuperKernelBaseNode* node,
SkDfxInfo* dfxInfo, size_t nodeIndex, int binCount, SkTaskType taskType,
SkQueueType queueType)
{
if (node->GetNodeType() != SkNodeType::NODE_KERNEL) {
SK_LOGE("DispatchFuncTask failed: unsupported node type for FUNC/PRELOAD task");
return false;
}
auto kernelInfo = node->GetNodeInfos().kernelInfos;
switch (queueType) {
case SkQueueType::AIV: {
uint32_t numBlocks = kernelInfo.numBlocks;
if (!AddFuncTask(skTaskVec, node, dfxInfo, nodeIndex, 1, binCount, taskType, numBlocks)) {
return false;
}
SK_LOGI(" task insert: task %zu, [queue=AIV], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
break;
}
case SkQueueType::AIC: {
uint32_t numBlocks = kernelInfo.numBlocks;
if (!AddFuncTask(skTaskCube, node, dfxInfo, nodeIndex, 0, binCount, taskType, numBlocks)) {
return false;
}
SK_LOGI(" task insert: task %zu, [queue=AIC], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
break;
}
case SkQueueType::MIX_1_1: {
uint32_t numBlocksAic = kernelInfo.numBlocks;
uint32_t numBlocksAiv = kernelInfo.numBlocks;
if (!AddFuncTask(skTaskCube, node, dfxInfo, nodeIndex, 0, binCount, taskType, numBlocksAic)) {
return false;
}
if (!AddFuncTask(skTaskVec, node, dfxInfo, nodeIndex, 1, binCount, taskType, numBlocksAiv)) {
return false;
}
SK_LOGI(" task insert: task %zu, [queue=AIC], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
SK_LOGI(" task insert: task %zu, [queue=AIV], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
break;
}
case SkQueueType::MIX_1_2: {
uint32_t numBlocksAic = kernelInfo.numBlocks;
uint32_t numBlocksAiv = kernelInfo.numBlocks * 2;
if (!AddFuncTask(skTaskCube, node, dfxInfo, nodeIndex, 0, binCount, taskType, numBlocksAic)) {
return false;
}
if (!AddFuncTask(skTaskVec, node, dfxInfo, nodeIndex, 1, binCount, taskType, numBlocksAiv)) {
return false;
}
skTaskVec.nodeType = SkKernelType::MIX_AIC_1_2;
skTaskCube.nodeType = SkKernelType::MIX_AIC_1_2;
SK_LOGI(" task insert: task %zu, [queue=AIC], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
SK_LOGI(" task insert: task %zu, [queue=AIV], [type=%s], [kernelType=%s], [nodeInfo=%s]", nodeIndex,
to_string(taskType), to_string(queueType), node->Format().c_str());
break;
}
default:
SK_LOGE("DispatchFuncTask failed: unsupported kernel type %s for super kernel", to_string(queueType));
return false;
}
return true;
}
bool SkTaskBuilder::DispatchEventTask(SkTask& skTaskCube, SkTask& skTaskVec, SuperKernelBaseNode* node,
size_t nodeIndex, SkTaskType taskType, SkQueueType queueType)
{
if (node->GetNodeType() != SkNodeType::NODE_NOTIFY && node->GetNodeType() != SkNodeType::NODE_WAIT
&& node->GetNodeType() != SkNodeType::NODE_RESET) {
SK_LOGE("DispatchEventTask failed: unsupported node type for EVENT_NOTIFY/EVENT_WAIT/EVENT_RESET task");
return false;
}
SkTask* targetTask = nullptr;
std::string targetQue;
switch (queueType) {
case SkQueueType::AIC:
targetTask = &skTaskCube;
targetQue = "AIC";
break;
case SkQueueType::AIV:
targetTask = &skTaskVec;
targetQue = "AIV";
break;
case SkQueueType::MIX_1_1:
case SkQueueType::MIX_1_2:
default:
targetTask = &skTaskVec;
targetQue = "AIV";
break;
}
if (!AddEventTask(*targetTask, node, nodeIndex, taskType)) {
SK_LOGE("DispatchEventTask failed: add event task failed");
return false;
}
SK_LOGI(" task insert: task %zu, [queue=%s], [type=%s], [nodeInfo=%s]", nodeIndex, targetQue.c_str(),
to_string(taskType), node->Format().c_str());
return true;
}
bool SkTaskBuilder::DispatchSyncTasks(SkTask& skTaskCube, SkTask& skTaskVec, size_t nodeIndex,
const std::map<size_t, SyncDirection>& syncInfo, bool isSend,
SkQueueType queueType)
{
for (const auto& kv : syncInfo) {
size_t peerIdx = kv.first;
SyncDirection dir = kv.second;
SkQueueType peerType = taskSyncInfos_[peerIdx].queueType;
auto syncType = GetSyncTypesForDirection(dir, isSend);
bool addToAicQue = false;
bool addToAivQue = false;
SkQueueType prevType = SkQueueType::UNKNOWN;
SkQueueType nextType = SkQueueType::UNKNOWN;
switch (syncType) {
case SkCoreSyncType::CROSS_SYNC_AIC_TO_AIC:
addToAicQue = true;
addToAivQue = false;
prevType = queueType;
nextType = SkQueueType::UNKNOWN;
break;
case SkCoreSyncType::CROSS_SYNC_AIV_TO_AIV:
addToAicQue = false;
addToAivQue = true;
prevType = queueType;
nextType = SkQueueType::UNKNOWN;
break;
case SkCoreSyncType::INTER_SYNC_SET_AIV_TO_AIC:
addToAicQue = false;
addToAivQue = true;
prevType = queueType;
nextType = peerType;
break;
case SkCoreSyncType::INTER_SYNC_WAIT_AIV_TO_AIC:
addToAicQue = true;
addToAivQue = false;
prevType = peerType;
nextType = queueType;
break;
case SkCoreSyncType::INTER_SYNC_SET_AIC_TO_AIV:
addToAicQue = true;
addToAivQue = false;
prevType = queueType;
nextType = peerType;
break;
case SkCoreSyncType::INTER_SYNC_WAIT_AIC_TO_AIV:
addToAicQue = false;
addToAivQue = true;
prevType = peerType;
nextType = queueType;
break;
case SkCoreSyncType::ALL_SYNC:
addToAicQue = true;
addToAivQue = true;
prevType = SkQueueType::UNKNOWN;
nextType = SkQueueType::UNKNOWN;
break;
default:
SK_LOGE("unknown sync type %s, skipping", to_string(syncType));
return false;
}
if (addToAicQue) {
if (!AddSyncTask(skTaskCube, nodeIndex, syncType)) {
SK_LOGE("DispatchSyncTasks failed: add sync task to aic queue failed");
return false;
}
SK_LOGI(" task insert: task %zu, [queue=AIC], [type=%s], [flag=%s], [prev=%s, next=%s]", nodeIndex,
to_string(SkTaskType::TYPE_SYNC), to_string(syncType), to_string(prevType), to_string(nextType));
}
if (addToAivQue) {
if (!AddSyncTask(skTaskVec, nodeIndex, syncType)) {
SK_LOGE("DispatchSyncTasks failed: add sync task to aiv queue failed");
return false;
}
SK_LOGI(" task insert: task %zu, [queue=AIV], [type=%s], [flag=%s], [prev=%s, next=%s]", nodeIndex,
to_string(SkTaskType::TYPE_SYNC), to_string(syncType), to_string(prevType), to_string(nextType));
}
}
return true;
}
namespace {
enum class EntryFuncFlag : uint8_t {
NONE = 0,
DEBUG = 1 << 0,
DUMP_PROFILING = 1 << 1,
OP_TRACE = 1 << 2,
};
* @brief 根据 kernel 类型获取基础 entry 函数名
*/
const char* GetBaseEntryFuncName(SkKernelType kernelType)
{
switch (kernelType) {
case SkKernelType::AIV_ONLY:
return "sk_entry_aiv";
case SkKernelType::AIC_ONLY:
return "sk_entry_aic";
case SkKernelType::MIX_AIC_1_1:
return "sk_entry_mix11";
case SkKernelType::MIX_AIC_1_2:
return "sk_entry_mix12";
default:
return "sk_entry_aic";
}
}
* @brief 根据标记位组合生成最终的 entry 函数名
*/
std::string BuildEntryFuncName(const char* baseName, EntryFuncFlag flags)
{
std::string funcName = baseName;
uint8_t flagValue = static_cast<uint8_t>(flags);
if ((flagValue & static_cast<uint8_t>(EntryFuncFlag::DEBUG)) != 0) {
funcName += "_debug";
}
if ((flagValue & static_cast<uint8_t>(EntryFuncFlag::DUMP_PROFILING)) != 0) {
funcName += "_dump_profiling";
}
if ((flagValue & static_cast<uint8_t>(EntryFuncFlag::OP_TRACE)) != 0) {
funcName += "_op_trace";
}
return funcName;
}
}
SkHostEntryInfo SkTaskBuilder::GenEntryInfo(SkTask& skTaskCube, SkTask& skTaskVec)
{
SkHostEntryInfo entryInfo;
bool enableDebug = opts.EnableDebug();
const char* profilingEnv = std::getenv("ASCEND_PROF_SK_ON");
bool enableProfiling = (profilingEnv != nullptr && std::string(profilingEnv) != "0");
auto opExecTraceOpt = opts.GetOption(aclskOptionType::DEBUG_OP_EXEC_TRACE);
bool enableOpTrace = (opExecTraceOpt != nullptr && opExecTraceOpt->GetIntValue() == 1);
auto crossCoreSyncCheckOpt = opts.GetOption(aclskOptionType::DEBUG_CROSS_CORE_SYNC_CHECK);
if (crossCoreSyncCheckOpt != nullptr && crossCoreSyncCheckOpt->GetIntValue() == 1) {
enableOpTrace = true;
}
SK_LOGI("GenEntryInfo: enableDebug=%d, enableProfiling=%d, enableOpTrace=%d",
enableDebug, enableProfiling, enableOpTrace);
auto [constantFunc, constantType] = TryGenerateConstantFuncHandle(
skTaskCube, skTaskVec, opts, graph_.GetModelRI());
if (constantFunc != nullptr) {
entryInfo.skEntryFunc = constantFunc;
entryInfo.entryType = constantType;
bool isMix12 = false;
if (constantType == SkKernelType::AIV_ONLY) {
entryInfo.numBlocks = skTaskVec.numBlocks;
skTaskCube.numBlocks = 0;
} else if (constantType == SkKernelType::AIC_ONLY) {
entryInfo.numBlocks = skTaskCube.numBlocks;
skTaskVec.numBlocks = 0;
} else if (constantType == SkKernelType::MIX_AIC_1_2) {
uint32_t mix_1_2_aiv_numBlocks = (skTaskVec.numBlocks + 1) / 2;
entryInfo.numBlocks = std::max(skTaskCube.numBlocks, mix_1_2_aiv_numBlocks);
skTaskCube.numBlocks = entryInfo.numBlocks;
skTaskVec.numBlocks = entryInfo.numBlocks * 2;
isMix12 = true;
} else {
entryInfo.numBlocks = skTaskCube.numBlocks;
skTaskVec.numBlocks = skTaskCube.numBlocks;
}
SK_LOGI("sk entry resolved via CONSTANT_CODEGEN: type=%s, funcHandle=%p, numBlocks=%u",
to_string(entryInfo.entryType), entryInfo.skEntryFunc, entryInfo.numBlocks);
if (isMix12) {
auto* taskQue = skTaskVec.GetTaskQue();
for (auto i = 0; i < taskQue->taskCnt; i++) {
TaskInfo& taskInfo = taskQue->taskInfos[i];
if (taskInfo.type == SkTaskType::TYPE_FUNC &&
taskInfo.originType == SkKernelType::MIX_AIC_1_1) {
taskInfo.numBlocks = taskInfo.numBlocks * 2;
}
}
}
return entryInfo;
}
SK_LOGI("Constant codegen disabled or unsuccessful, falling back to default entry resolution");
SkKernelType kernelType = SkKernelType::AIC_ONLY;
bool isMix12 = false;
if (skTaskCube.funcCnt == 0 && skTaskVec.funcCnt > 0) {
kernelType = SkKernelType::AIV_ONLY;
entryInfo.numBlocks = skTaskVec.numBlocks;
skTaskCube.numBlocks = 0;
} else if (skTaskCube.funcCnt > 0 && skTaskVec.funcCnt == 0) {
kernelType = SkKernelType::AIC_ONLY;
entryInfo.numBlocks = skTaskCube.numBlocks;
skTaskVec.numBlocks = 0;
} else if (skTaskCube.funcCnt > 0 && skTaskVec.funcCnt > 0) {
uint32_t mix_1_2_aiv_numBlocks = (skTaskVec.numBlocks + 1) / 2;
if (skTaskCube.nodeType == SkKernelType::MIX_AIC_1_2 && skTaskVec.nodeType == SkKernelType::MIX_AIC_1_2) {
kernelType = SkKernelType::MIX_AIC_1_2;
isMix12 = true;
entryInfo.numBlocks = std::max(skTaskCube.numBlocks, mix_1_2_aiv_numBlocks);
skTaskCube.numBlocks = entryInfo.numBlocks;
skTaskVec.numBlocks = entryInfo.numBlocks * 2;
} else if (skTaskVec.numBlocks <= skTaskCube.numBlocks) {
kernelType = SkKernelType::MIX_AIC_1_1;
entryInfo.numBlocks = skTaskCube.numBlocks;
skTaskVec.numBlocks = skTaskCube.numBlocks;
} else {
kernelType = SkKernelType::MIX_AIC_1_2;
isMix12 = true;
entryInfo.numBlocks = std::max(skTaskCube.numBlocks, mix_1_2_aiv_numBlocks);
skTaskCube.numBlocks = entryInfo.numBlocks;
skTaskVec.numBlocks = entryInfo.numBlocks * 2;
}
} else {
SK_LOGE("both skTaskCube and skTaskVec have no task, aborting");
return {};
}
entryInfo.entryType = kernelType;
uint8_t flags = static_cast<uint8_t>(EntryFuncFlag::NONE);
if (enableDebug) {
flags = flags | static_cast<uint8_t>(EntryFuncFlag::DEBUG);
}
if (enableProfiling) {
flags = flags | static_cast<uint8_t>(EntryFuncFlag::DUMP_PROFILING);
}
if (enableOpTrace) {
flags = flags | static_cast<uint8_t>(EntryFuncFlag::OP_TRACE);
}
EntryFuncFlag funcFlags = static_cast<EntryFuncFlag>(flags);
const char* baseName = GetBaseEntryFuncName(kernelType);
std::string entryFuncName = BuildEntryFuncName(baseName, funcFlags);
entryInfo.skEntryFunc = ResolveSkEntryFunc(entryFuncName.c_str());
if (entryInfo.skEntryFunc == nullptr) {
SK_LOGE("failed to resolve sk entry function: entryFuncName=%s", entryFuncName.c_str());
return {};
}
if (isMix12) {
auto* taskQue = skTaskVec.GetTaskQue();
for (auto i = 0; i < taskQue->taskCnt; i++) {
TaskInfo& taskInfo = taskQue->taskInfos[i];
if (taskInfo.type != SkTaskType::TYPE_FUNC) {
continue;
}
if (taskInfo.originType == SkKernelType::MIX_AIC_1_1) {
taskInfo.numBlocks = taskInfo.numBlocks * 2;
}
}
}
SK_LOGI("sk entry resolved: type=%s, funcName=%s, funcHandle=%p, numBlocks=%d",
to_string(entryInfo.entryType), entryFuncName.c_str(), entryInfo.skEntryFunc, entryInfo.numBlocks);
return entryInfo;
}
SkBuildResult SkTaskBuilder::Build(std::string skFuncName, const std::vector<SuperKernelBaseNode*>& tasks,
const std::vector<SuperKernelBaseNode*>& customTasks, uint16_t scopeId)
{
if (tasks.empty()) {
SK_LOGE("Build failed: no task to build for super kernel");
return {};
}
const size_t taskCount = tasks.size();
size_t cap = taskCount * (uint8_t)SkTaskType::TYPE_MAX;
SkTask aicTask;
SkTask aivTask;
if (!aicTask.Init(cap)) {
SK_LOGE("Build failed: init aic task queue failed, cap=%zu", cap);
return {};
}
if (!aivTask.Init(cap)) {
SK_LOGE("Build failed: init aiv task queue failed, cap=%zu", cap);
return {};
}
const auto* debugSyncOpt = opts.GetOption(aclskOptionType::DEBUG_SYNC_ALL);
const bool debugSyncAll = (debugSyncOpt != nullptr && debugSyncOpt->GetIntValue() != 0);
if (taskCount > (std::numeric_limits<size_t>::max() / sizeof(SkDfxInfo))) {
SK_LOGE("invalid dfxInfos alloc size, taskCount=%zu", taskCount);
return {};
}
size_t dfxBytes = taskCount * sizeof(SkDfxInfo);
std::unique_ptr<uint8_t[]> dfxStorage = std::make_unique<uint8_t[]>(dfxBytes);
SkDfxInfo* dfxInfos = reinterpret_cast<SkDfxInfo*>(dfxStorage.get());
errno_t initErr = memset_s(dfxInfos, dfxBytes, 0, dfxBytes);
if (initErr != 0) {
SK_LOGE("init dfxInfos failed, ret=%d", static_cast<int>(initErr));
return {};
}
int splitBinCount = 4;
auto splitOptions = opts.GetOption(aclskOptionType::SPLIT_MODE);
if (splitOptions != nullptr) {
splitBinCount = splitOptions->GetIntValue();
}
if (opts.EnableMixKernelSplit()) {
if (!PrecomputeSyncRelationsByMixGroups(tasks)) {
SK_LOGE("Build failed: precompute sync relations with mix kernel split failed");
return {};
}
} else {
SK_LOGI("Build phase-1 begin: precompute sync relations, taskCount=%zu", taskCount);
if (!PrecomputeSyncRelationsFromGraph(tasks)) {
SK_LOGE("Build failed: precompute sync relations failed");
return {};
}
SK_LOGI("Build phase-2 begin: optimize sync relations");
OptimizeSyncRelations(tasks);
}
if (!customTasks.empty()) {
SK_LOGI("add sync info for custom tasks, customTaskCount=%zu", customTasks.size());
taskSyncInfos_.back().crossSyncInfo[static_cast<size_t>(SkQueueType::AIC)] = SyncDirection::ALL_SYNC;
}
if (opts.EnableDebug() && debugSyncAll) {
SK_LOGI("tracing sync all is enabled, now clear all sync info, and only left cross sync info. task count is %d",
taskCount);
for (int i = 0; i < static_cast<int>(taskCount); i++) {
taskSyncInfos_[i].vecRecvInfo.clear();
taskSyncInfos_[i].cubRecvInfo.clear();
taskSyncInfos_[i].vecSendInfo.clear();
taskSyncInfos_[i].cubSendInfo.clear();
taskSyncInfos_[i].crossSyncInfo.clear();
taskSyncInfos_[i].crossSyncInfo[static_cast<size_t>(SkQueueType::AIC)] = SyncDirection::ALL_SYNC;
}
}
SK_LOGI("start build tasks for super kernel, taskCount=%zu", taskCount);
int preloadCount = 1;
SK_LOGI("add tasks preload, preload count is %d, task count is %d", preloadCount, taskCount);
for (int i = 0; i < preloadCount && i < static_cast<int>(taskCount); i++) {
if (tasks[i]->GetNodeType() == SkNodeType::NODE_KERNEL) {
SkQueueType queueType = taskSyncInfos_[i].queueType;
if (!DispatchFuncTask(aicTask, aivTask, tasks[i], dfxInfos + i, i, splitBinCount,
SkTaskType::TYPE_PRELOAD, queueType)) {
SK_LOGE("Build failed: preload dispatch failed at task index %d", i);
return {};
}
}
}
SK_LOGI("start dispatch tasks...");
for (int i = 0; i < static_cast<int>(taskCount); i++) {
SK_LOGI("index=%d, nodeType=%s, nodeInfo=%s", i, to_string(tasks[i]->GetNodeType()),
tasks[i]->Format().c_str());
auto& info = taskSyncInfos_[i];
SkQueueType queueType = info.queueType;
SK_LOGI("add task sync for vec recv, vec recv size %zu", info.vecRecvInfo.size());
if (!DispatchSyncTasks(aicTask, aivTask, i, info.vecRecvInfo, false, queueType)) {
SK_LOGE("Build failed: dispatch vec recv sync failed at task index %d", i);
return {};
}
SK_LOGI("add task sync for cub recv, cub recv size %zu", info.cubRecvInfo.size());
if (!DispatchSyncTasks(aicTask, aivTask, i, info.cubRecvInfo, false, queueType)) {
SK_LOGE("Build failed: dispatch cub recv sync failed at task index %d", i);
return {};
}
switch (tasks[i]->GetNodeType()) {
case SkNodeType::NODE_KERNEL:
SK_LOGI("add task func, task index is %d", i);
if (!DispatchFuncTask(aicTask, aivTask, tasks[i], dfxInfos + i, i, splitBinCount,
SkTaskType::TYPE_FUNC, queueType)) {
SK_LOGE("Build failed: function dispatch failed at task index %d", i);
return {};
}
break;
case SkNodeType::NODE_NOTIFY:
SK_LOGI("add task notify, task index is %d", i);
if (!DispatchEventTask(aicTask, aivTask, tasks[i], i, SkTaskType::TYPE_EVENT_NOTIFY, queueType)) {
SK_LOGE("Build failed: notify dispatch failed at task index %d", i);
return {};
}
break;
case SkNodeType::NODE_WAIT:
SK_LOGI("add task wait, task index is %d", i);
if (!DispatchEventTask(aicTask, aivTask, tasks[i], i, SkTaskType::TYPE_EVENT_WAIT, queueType)) {
SK_LOGE("Build failed: wait dispatch failed at task index %d", i);
return {};
}
break;
case SkNodeType::NODE_RESET:
SK_LOGI("add task reset, task index is %d", i);
if (!DispatchEventTask(aicTask, aivTask, tasks[i], i, SkTaskType::TYPE_EVENT_RESET, queueType)) {
SK_LOGE("Build failed: reset dispatch failed at task index %d", i);
return {};
}
break;
default:
SK_LOGE("process task %d: unsupported node type %s, skipping (supported types: KERNEL, NOTIFY, WAIT, RESET)", i,
to_string(tasks[i]->GetNodeType()));
return {};
}
if (preloadCount > 0 && i + preloadCount < static_cast<int>(taskCount)
&& tasks[i + preloadCount]->GetNodeType() == SkNodeType::NODE_KERNEL) {
SkQueueType preloadQueueType = taskSyncInfos_[i + preloadCount].queueType;
SK_LOGI("add tasks preload, task index is %d", i + preloadCount);
if (!DispatchFuncTask(aicTask, aivTask, tasks[i + preloadCount], dfxInfos + i + preloadCount,
i + preloadCount, splitBinCount, SkTaskType::TYPE_PRELOAD, preloadQueueType)) {
SK_LOGE("Build failed: rolling preload dispatch failed at task index %d", i + preloadCount);
return {};
}
}
SK_LOGI("add sync task for cross sync info, cross sync info size %zu", info.crossSyncInfo.size());
if (!DispatchSyncTasks(aicTask, aivTask, i, info.crossSyncInfo, true, queueType)) {
SK_LOGE("Build failed: dispatch cross sync failed at task index %d", i);
return {};
}
SK_LOGI("add sync task for vec send, vec send size %zu", info.vecSendInfo.size());
if (!DispatchSyncTasks(aicTask, aivTask, i, info.vecSendInfo, true, queueType)) {
SK_LOGE("Build failed: dispatch vec send sync failed at task index %d", i);
return {};
}
SK_LOGI("add sync task for cub send, cub send size %zu", info.cubSendInfo.size());
if (!DispatchSyncTasks(aicTask, aivTask, i, info.cubSendInfo, true, queueType)) {
SK_LOGE("Build failed: dispatch cub send sync failed at task index %d", i);
return {};
}
}
SK_LOGI("Finish build tasks for super kernel: aicTaskCnt=%u, aivTaskCnt=%u", aicTask.GetTaskQue()->taskCnt,
aivTask.GetTaskQue()->taskCnt);
if (!customTasks.empty()) {
SK_LOGI("start process custom tasks");
SK_LOGI("direct add custom tasks");
constexpr size_t kInvalidCustomTaskIndex = static_cast<size_t>(std::numeric_limits<uint32_t>::max());
std::unordered_map<uint64_t, const SuperKernelBaseNode*> kernelNodeCache;
SkQueueType firstKernelEventQueueType = InferFirstKernelEventQueueType(tasks, aicAvailable_, aivAvailable_);
for (size_t i = 0; i < customTasks.size(); i++) {
auto* node = customTasks[i];
SkQueueType queueType;
auto* kernel =
FindKernelNodeInDirection(node->GetPreNodeId(), graph_, SearchDirection::PREV, kernelNodeCache);
if (kernel == nullptr) {
if (firstKernelEventQueueType == SkQueueType::UNKNOWN) {
SK_LOGE("Build failed: custom task %zu failed to resolve previous KERNEL node and fallback queue type is UNKNOWN, nodeId=%lu",
i, node->GetNodeId());
return {};
}
queueType = firstKernelEventQueueType;
SK_LOGI("custom task %zu: unable to resolve previous KERNEL node, nodeId=%lu, fallback = %s", i,
node->GetNodeId(), to_string(queueType));
} else {
queueType =
ToEventQueueType(ToQueueType(GetKernelInfos(kernel).kernelType), aicAvailable_, aivAvailable_);
}
if (node->GetNodeType() == SkNodeType::NODE_NOTIFY) {
if (!DispatchEventTask(aicTask, aivTask, node, kInvalidCustomTaskIndex, SkTaskType::TYPE_EVENT_NOTIFY,
queueType)) {
SK_LOGE("Build failed: custom task %zu notify dispatch failed, nodeId=%lu", i,
node->GetNodeId());
return {};
}
} else if (node->GetNodeType() == SkNodeType::NODE_WAIT) {
if (!DispatchEventTask(aicTask, aivTask, node, kInvalidCustomTaskIndex, SkTaskType::TYPE_EVENT_WAIT,
queueType)) {
SK_LOGE("Build failed: custom task %zu wait dispatch failed, nodeId=%lu", i,
node->GetNodeId());
return {};
}
} else if (node->GetNodeType() == SkNodeType::NODE_RESET) {
if (!DispatchEventTask(aicTask, aivTask, node, kInvalidCustomTaskIndex, SkTaskType::TYPE_EVENT_RESET,
queueType)) {
SK_LOGE("Build failed: custom task %zu reset dispatch failed, nodeId=%lu", i,
node->GetNodeId());
return {};
}
} else {
SK_LOGE("Build failed: custom task %zu has unsupported node type %s, nodeId=%lu", i,
to_string(node->GetNodeType()), node->GetNodeId());
return {};
}
}
SK_LOGI("finish process custom tasks");
}
SK_LOGI("Get entry info...");
SkHostEntryInfo entryInfo = GenEntryInfo(aicTask, aivTask);
if (entryInfo.skEntryFunc == nullptr) {
SK_LOGE("Build failed: GenEntryInfo failed");
return {};
}
entryInfo.nodeCnt = static_cast<uint32_t>(taskCount);
SK_LOGI("Get entry args...");
DeviceArgsPtr devArgs = GenEntryArgs(aicTask, aivTask, dfxInfos, static_cast<uint32_t>(taskCount));
if (devArgs.Get() == nullptr) {
SK_LOGE("Build failed: GenEntryArgs failed");
return {};
}
DumpDeviceArgs(skFuncName, devArgs.Get());
SkLaunchInfo launchInfo;
launchInfo.entryInfo = std::move(entryInfo);
launchInfo.devArgs = std::move(devArgs);
launchInfo.skFuncName = skFuncName;
SK_LOGI("SkTaskToQueueJson: generating task queue JSON for scopeId=%u", scopeId);
Json taskQueueJson = SkTaskToQueueJson(aicTask, aivTask, scopeId);
SkBuildResult result;
result.launchInfo = std::move(launchInfo);
result.taskQueueJson = std::move(taskQueueJson);
return result;
}