pypto/framework/src/cost_model/simulation/cost_model_launcher.h · CANN/pypto - AtomGit

cann-robotperf(machine): Optimize dynamic cell match stride passing
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file cost_model_launcher.h
 * \brief
 */

#pragma once

#include <thread>
#include <cstdint>
#include <unistd.h>
#include "interface/interpreter/raw_tensor_data.h"
#include "interface/configs/config_manager.h"
#include "interface/function/function.h"
#include "cost_model/simulation/pv/PvModel.h"
#include "cost_model/simulation/pv/PvModelFactory.h"
#include "machine/device/dynamic/costmodel_utils.h"
#include "machine/runtime/launcher/device_launcher.h"
#include "machine/runtime/launcher/cell_match_dynamic.h"
#include "cost_model/simulation/backend.h"
#include "tilefwk/pypto_fwk_log.h"

namespace npu::tile_fwk::dynamic {
class HostAgentStub {
public:
    HostAgentStub(HostAgentStub& other) = delete;

    void operator=(const HostAgentStub& other) = delete;

    static HostAgentStub* GetAgent()
    {
        static HostAgentStub inst;
        return &inst;
    }

    uint8_t* AllocHostAddr(uint64_t size)
    {
        auto hostPtr = (uint8_t*)malloc(size);
        ASSERT(CostModel::InternelErrorScene::NULL_POINTER, hostPtr != nullptr) << "alloc host addr failed.";
        allocatedHostAddr.emplace_back(hostPtr);
        return hostPtr;
    }

    void Finalize()
    {
        if (hostInited) {
            DestroyMemory();
        }
    }

    ~HostAgentStub() { Finalize(); }

protected:
    HostAgentStub() { Init(); }

    void DestroyMemory()
    {
        for (uint8_t* addr : allocatedHostAddr) {
            free(addr);
        }
    }

private:
    void Init() { hostInited = true; }

private:
    bool hostInited{false};

    std::vector<uint8_t*> allocatedHostAddr;
};

struct MemoryHelper {
    MemoryHelper(bool isTest) : isTest_(isTest) {}

    bool IsDevice() { return !isTest_; }

    uint8_t* CopyToDev(uint8_t* data, uint64_t size, uint8_t** cachedDevAddrHolder)
    {
        (void)cachedDevAddrHolder;
        auto ptr = npu::tile_fwk::dynamic::HostAgentStub::GetAgent()->AllocHostAddr(size);
        memcpy_s(ptr, size, data, size);
        return ptr;
    }

    template <typename T>
    T* CopyToDev(std::vector<T> data)
    {
        return (T*)CopyToDev((uint8_t*)data.data(), data.size() * sizeof(T));
    }

    template <typename T>
    T* CopyToDev(std::vector<T> data, uint8_t** cachedDevAddrHolder)
    {
        (void)cachedDevAddrHolder;
        return (T*)CopyToDev((uint8_t*)data.data(), data.size() * sizeof(T), nullptr);
    }

    uint8_t* CopyToDev(RawTensorData& data)
    {
        if (data.GetDevPtr() == nullptr) {
            auto devPtr = CopyToDev((uint8_t*)data.data(), data.size(), nullptr);
            data.SetDevPtr(devPtr);
        }
        return data.GetDevPtr();
    }

    void CopyFromDev(uint8_t* data, uint8_t* devPtr, uint64_t size) { memcpy_s(data, size, devPtr, size); }

    uint8_t* AllocDev(size_t size, uint8_t** cachedDevAddrHolder)
    {
        (void)cachedDevAddrHolder;
        uint8_t* devPtr = npu::tile_fwk::dynamic::HostAgentStub::GetAgent()->AllocHostAddr(size);
        return devPtr;
    }

    uint8_t* AllocZero(uint64_t size, uint8_t** cachedDevAddrHolder)
    {
        (void)cachedDevAddrHolder;
        uint8_t* devPtr = AllocDev(size, nullptr);
        memset_s(devPtr, size, 0, size);
        return devPtr;
    }

    void CopyFromDev(RawTensorData& t) { CopyFromDev(t.data(), t.GetDevPtr(), t.size()); }

    uint64_t GetL2Offset() { return 0; }

    bool isTest_{true};
};

class AiCorePvModelImpl : public CostModel::AiCoreModel {
private:
    std::shared_ptr<CostModel::DynPvModel> pv_;
    std::unordered_map<int, uint64_t> funcdata_;
    std::mutex mtx_;

public:
    explicit AiCorePvModelImpl(std::shared_ptr<CostModel::DynPvModel> pv) : pv_(pv) {}

    void InitData(int coreIdx, int64_t funcdata)
    {
        std::lock_guard<std::mutex> lock(mtx_);
        funcdata_[coreIdx] = funcdata;
    }

    void SendTask(int coreIdx, uint64_t taskId)
    {
        auto funcdata = funcdata_[coreIdx];
        DynFuncHeader* header = reinterpret_cast<DynFuncHeader*>(funcdata);
        DynFuncData* data = reinterpret_cast<DynFuncData*>(header + 1);
        pv_->Run(data, coreIdx, FuncID(taskId), TaskID(taskId));
    }
};

extern "C" int DynTileFwkBackendKernelServer(void* targ);
extern "C" int PyptoKernelCtrlServer(void* targ);

class CostModelLauncher : public DeviceLauncher {
public:
    static void CostModelRunOnce(
        Function* function, const std::vector<RawTensorDataPtr>& inputs, const std::vector<RawTensorDataPtr>& outputs,
        const DeviceLauncherConfig& config = DeviceLauncherConfig())
    {
        auto runner = CostModelLauncher(function, config);
        runner.RunDynamic(inputs, outputs);
    }

    // Run with incast/outcast from ProgramData
    static void CostModelRunOnce(Function* function, const DeviceLauncherConfig& config = DeviceLauncherConfig())
    {
        auto& inputs = ProgramData::GetInstance().GetInputDataList();
        auto& outputs = ProgramData::GetInstance().GetOutputDataList();
        auto runner = CostModelLauncher(function, config);
        runner.RunDynamic(inputs, outputs);
    }

private:
    static void PatchRuntimeDynamicCellMatchMeta(
        MemoryHelper& memoryHelper, DevAscendProgram* hostProg, DevAscendProgram* cfgProg)
    {
        if (hostProg == nullptr || cfgProg == nullptr) {
            return;
        }
        uint64_t dynamicCellMatchBytes = hostProg->memBudget.metadata.dynamicCellMatch;
        if (dynamicCellMatchBytes == 0) {
            hostProg->devArgs.dynamicCellMatchAddr = 0;
            hostProg->devArgs.dynamicCellMatchCapacity = 0;
            cfgProg->devArgs.dynamicCellMatchAddr = 0;
            cfgProg->devArgs.dynamicCellMatchCapacity = 0;
            return;
        }
        uint8_t* dynamicCellMatchAddr = memoryHelper.AllocZero(dynamicCellMatchBytes, nullptr);
        uint64_t dynamicCellMatchAddrU64 = reinterpret_cast<uint64_t>(dynamicCellMatchAddr);
        hostProg->devArgs.dynamicCellMatchAddr = dynamicCellMatchAddrU64;
        hostProg->devArgs.dynamicCellMatchCapacity = dynamicCellMatchBytes;
        cfgProg->devArgs.dynamicCellMatchAddr = dynamicCellMatchAddrU64;
        cfgProg->devArgs.dynamicCellMatchCapacity = dynamicCellMatchBytes;
    }

    CostModelLauncher(Function* function, const DeviceLauncherConfig& config) : function_(function), config_(config) {}

    void RunDynamic(const std::vector<RawTensorDataPtr>& inputs, const std::vector<RawTensorDataPtr>& outputs)
    {
        if (function_ == nullptr || function_->GetDyndevAttribute() == nullptr) {
            return;
        }
        RunModel(inputs, outputs);
    }

    static void RunStatic() {}

    void RunModel(const std::vector<RawTensorDataPtr>& inputs, const std::vector<RawTensorDataPtr>& outputs)
    {
        DeviceKernelArgs kArgs;
        auto dynAttr = function_->GetDyndevAttribute();
        DeviceLauncherConfigFillDeviceInfo(config_);
        MemoryHelper memoryHelper(true);
        DeviceInitDistributedContext(memoryHelper, dynAttr->commGroupNames, kArgs);
        DeviceInitTilingData(memoryHelper, kArgs, dynAttr->devProgBinary, nullptr, config_, nullptr);
        RefillDynamicCellMatchAndMemBudget(memoryHelper, kArgs, inputs, outputs);
        InitKernelInOuts(kArgs, inputs, outputs, true);
        auto* functionDevProg = reinterpret_cast<DevAscendProgram*>(dynAttr->devProgBinary.data());
        kArgs.maxDynamicAssembleOutcastMem = functionDevProg->memBudget.tensor.maxDynamicAssembleOutcastMem;
        kArgs.maxDynamicCellMatchTableMem = functionDevProg->memBudget.metadata.maxDynamicCellMatchTableMem;
        RunCostModel(&kArgs);
        SIMULATION_LOGI("Run TestModel");
        RunTestMode(&kArgs);
        SIMULATION_LOGI("Run DynCostModel");
        RunDynCostModel();
        SIMULATION_LOGI("Run PvModel");
#ifdef BUILD_WITH_CANN
        RunPvModel(kArgs, inputs, outputs);
#endif
    }

    void RunCostModel(DeviceKernelArgs* kArgs)
    {
        if (!config::GetPlatformConfig(KEY_ENABLE_DYN_COST_MODEL, true)) {
            return;
        }
        Function* function = Program::GetInstance().GetLastFunction();
        if (function == nullptr) {
            return;
        }
        config::SetSimConfig(KEY_SIM_MODE, CostModel::SimMode::LEAF_FUNCTION);
        CostModelAgent costModelAgent;
        costModelAgent.SubmitLeafFunctionsToCostModel();
        costModelAgent.RunCostModel();
        costModelAgent.TerminateCostModel();
        CostModel::ModelData* modelData = new CostModel::ModelData();
        auto attr = function->GetDyndevAttribute();
        modelData->functionTime.resize(attr->devLeafIndex2Hash.size(), 0);
        for (const auto& [index, hash] : attr->devLeafIndex2Hash) {
            auto time = costModelAgent.GetLeafFunctionTimeCost(hash);
            DEV_INFO("devLeafIndex2Hash, %d -> %lu: %lu\n", index, hash, time);
            modelData->functionTime[index] = time;
        }
        kArgs->costmodeldata = modelData;
    }

    void RunDynCostModel()
    {
        if (config::GetRuntimeOption<int64_t>(CFG_RUN_MODE) != CFG_RUN_MODE_SIM) {
            return;
        }
        config::SetSimConfig(KEY_SIM_MODE, CostModel::SimMode::NORMAL);
        CostModelAgent costModelAgent;

        std::string path = config::LogTopFolder() + "/dyn_topo.txt";
        costModelAgent.SubmitTopo(path);
        costModelAgent.SubmitLeafFunctionsToCostModel();
        costModelAgent.RunCostModel();
        costModelAgent.TerminateCostModel();
    }

    void RunPvModel(DeviceKernelArgs& kArgs, const std::vector<RawTensorDataPtr>& inputs,
        const std::vector<RawTensorDataPtr>& outputs)
    {
        if (config::GetRuntimeOption<int64_t>(CFG_RUN_MODE) != CFG_RUN_MODE_SIM ||
            std::getenv("ASCEND_HOME_PATH") == nullptr) {
            return;
        }
        
        try {
            pv_ = CostModel::PvModelFactory::CreateDyn();
            pv_->InitPv();
        } catch (const std::runtime_error& e) {
            SIMULATION_LOGE(CostModel::PrecisionSimErrorScene::NO_SO_EXISTS, "pv init fail.");
            return;
        }
        model_ = std::make_shared<AiCorePvModelImpl>(pv_);
        pv_->Codegen(function_);
        BuildPvKernelArgs(kArgs, inputs, outputs);
        RunTestMode(&kArgs);
        pv_->CopyTensorFromDev();
    }

    void BuildPvKernelArgs(DeviceKernelArgs& kArgs, const std::vector<RawTensorDataPtr>& inputs,
        const std::vector<RawTensorDataPtr>& outputs)
    {
        MemoryHelper devMem{true};
        auto buildInouts = [&](auto& tensorList, DevTensorData* tensorData) {
            for (auto& t : tensorList) {
                auto addrs = reinterpret_cast<uint64_t>(pv_->CopyTensorToDev((uint8_t*)t->data(), t->size()));
                DevAscendTensorDataCreator::Init(tensorData, addrs, t->GetShape().data(), t->GetShape().size());
                tensorData++;
            }
            return;
        };

        std::vector<uint8_t>& devProgData = function_->GetDyndevAttribute()->devProgBinary;
        auto* devProg = reinterpret_cast<DevAscendProgram*>(const_cast<uint8_t*>(devProgData.data()));

        devProg->devArgs.nrAicpu = 1;
        devProg->devArgs.nrValidAic = 1;
        devProg->devArgs.scheCpuNum = 1;
        AssignMetaAddr(devMem, kArgs, devProg, nullptr);
        PatchRuntimeDynamicCellMatchMeta(devMem, devProg, devProg);
        const uint64_t maxPatchCount = function_->GetDyndevAttribute()->dynamicCellMatchLaunchMetaList.size();
        size_t patchTailSize = sizeof(uint64_t) + maxPatchCount * sizeof(DevDynamicCellMatchStridePatch);
        size_t tensorSize = (inputs.size() + outputs.size()) * sizeof(DevTensorData) + 2 * sizeof(uint64_t) + patchTailSize;
        std::vector<uint8_t> tensorInfo(tensorSize);
        auto data = reinterpret_cast<uint64_t*>(tensorInfo.data());
        *data = inputs.size();
        data++;
        *data = outputs.size();
        data++;
        auto dataPtr = reinterpret_cast<DevTensorData*>(data);
        buildInouts(inputs, dataPtr);
        dataPtr += inputs.size();
        buildInouts(outputs, dataPtr);
        WriteDynamicCellMatchStridePatchesToLaunchArgs(reinterpret_cast<int64_t*>(tensorInfo.data()), cellMatchDescPatches_);
        kArgs.inputs = (int64_t*)pv_->CopyToDev(tensorInfo.data(), tensorSize);
        kArgs.outputs = kArgs.inputs + 1;
        kArgs.cfgdata = (int64_t*)pv_->CopyToDev(devProgData.data(), devProgData.size());
        kArgs.aicoreModel = model_.get();
    }

    void RunTestMode(DeviceKernelArgs* kArgs)
    {
        std::atomic<int> idx{0};
        auto* devProg = (DevAscendProgram*)(kArgs->cfgdata);
        size_t shmSize = DEVICE_TASK_CTRL_POOL_SIZE + DEVICE_TASK_QUEUE_SIZE * devProg->devArgs.scheCpuNum;
        auto deviceTaskCtrlPoolAddr =
            devProg->devArgs.runtimeDataRingBufferAddr + sizeof(RuntimeDataRingBufferHead) + DEV_ARGS_SIZE;
        (void)memset_s(reinterpret_cast<void*>(deviceTaskCtrlPoolAddr), shmSize, 0, shmSize);
        int launchAiCpuNum = static_cast<int>(devProg->devArgs.nrAicpu + dynamic::MAX_CONTROL_FLOW_AICPU_NUM);
        std::vector<std::thread> aicpus(launchAiCpuNum);
        auto threadFun = [&](uint32_t runMode) {
            int tidx = idx++;
            cpu_set_t cpuset;
            CPU_ZERO(&cpuset);
            CPU_SET(tidx, &cpuset);
            std::string name = "aicput" + std::to_string(tidx);
            pthread_setname_np(pthread_self(), name.c_str());
            pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
            DeviceKernelArgs localArgs = *kArgs;
            localArgs.parameter.runMode = runMode;
            (void)DynTileFwkBackendKernelServer(&localArgs);
        };

        aicpus[0] = std::thread(threadFun, RUN_SPLITTED_STREAM_CTRL);
        for (int i = 1; i < launchAiCpuNum; i++) {
           aicpus[i] = std::thread(threadFun, RUN_SPLITTED_STREAM_SCHE);
        }

        for (int i = 0; i < launchAiCpuNum; i++) {
            if (aicpus[i].joinable()) {
                aicpus[i].join();
            }
        }
    }

    void InitKernelInOuts(
        DeviceKernelArgs& kArgs, const std::vector<RawTensorDataPtr>& inputTensors,
        const std::vector<RawTensorDataPtr>& outputTensors, bool isTest)
    {
        std::vector<DeviceTensorData> inputList;
        std::vector<DeviceTensorData> outputList;
        MemoryHelper memoryHelper(isTest);
        std::tie(inputList, outputList) = BuildInputOutputFromHost(memoryHelper, inputTensors, outputTensors);
        const uint64_t maxPatchCount = function_->GetDyndevAttribute()->dynamicCellMatchLaunchMetaList.size();
        DeviceInitKernelInOuts(memoryHelper, kArgs, inputList, outputList, {}, maxPatchCount);
        WriteDynamicCellMatchStridePatchesToLaunchArgs(kArgs.inputs, cellMatchDescPatches_);
        SIMULATION_LOGI(
            "Inputs %p outputs %p workspace %p cfgdata %p", kArgs.inputs, kArgs.outputs, kArgs.workspace,
            kArgs.cfgdata);
    }

    void RefillDynamicCellMatchAndMemBudget(
        MemoryHelper& memoryHelper, DeviceKernelArgs& kArgs, const std::vector<RawTensorDataPtr>& inputs,
        const std::vector<RawTensorDataPtr>& outputs)
    {
        auto dynAttr = function_->GetDyndevAttribute();
        auto* functionDevProg = reinterpret_cast<DevAscendProgram*>(dynAttr->devProgBinary.data());
        std::vector<DeviceTensorData> evalInputList;
        std::vector<DeviceTensorData> evalOutputList;
        std::tie(evalInputList, evalOutputList) = BuildInputOutputFromHost(memoryHelper, inputs, outputs);
        Evaluator eval{dynAttr->inputSymbolDict, evalInputList, evalOutputList};
        auto* cfgProg = reinterpret_cast<DevAscendProgram*>(kArgs.cfgdata);
        cellMatchDescPatches_ = PrepareHostDynamicCellMatchForLaunch(*dynAttr.get(), eval, functionDevProg);
        cfgProg->memBudget = functionDevProg->memBudget;
        PatchRuntimeDynamicCellMatchMeta(memoryHelper, functionDevProg, cfgProg);
    }

private:
    Function* function_;
    DeviceLauncherConfig config_;
    std::vector<DevDynamicCellMatchStridePatch> cellMatchDescPatches_;
    std::shared_ptr<CostModel::DynPvModel> pv_;
    std::shared_ptr<CostModel::AiCoreModel> model_;
}; // CostModelLauncher
} // namespace npu::tile_fwk::dynamic