* This file is part of the MindStudio project.
* Copyright (c) 2025 Huawei Technologies Co.,Ltd.
*
* MindStudio is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*/
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <semaphore.h>
#include <fcntl.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstring>
#include <climits>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <map>
#include <cmath>
#include <csignal>
#include <functional>
#include <unordered_set>
#include "acl/acl_prof.h"
#include "acl/acl.h"
#include "mstx/ms_tools_ext.h"
#include "securec.h"
#include "msServiceProfiler/ServiceProfilerInterface.h"
#include "msServiceProfiler/Profiler.h"
#include "msServiceProfiler/Log.h"
#include "msServiceProfiler/Utils.h"
#include "msServiceProfiler/ServiceProfilerDbWriter.h"
#include "msServiceProfiler/SecurityUtilsLog.h"
#include "msServiceProfiler/ServiceProfilerMspti.h"
#include "msServiceProfiler/DBExecutor/DbExecutorServiceData.h"
#include "msServiceProfiler/DBExecutor/DbExecutorSliceData.h"
#include "msServiceProfiler/DBExecutor/DbExecutorMetaData.h"
#include "msServiceProfiler/ServiceProfilerInterface.h"
#include "msServiceProfiler/ServiceProfilerManager.h"
namespace {
// File-local limits and identifiers used by the profiler glue code below.
// NOTE(review): MAX_TX_MSG_LEN is not referenced in the visible part of this file — confirm it is still needed.
constexpr int MAX_TX_MSG_LEN = 128;
// Capacity of the device-id array handed to aclprofCreateConfig() in ProfCreateConfig().
constexpr int MAX_DEVICE_NUM = 128;
// Number of per-thread span start-time slots; span handles are mapped onto slots modulo this value.
constexpr int SPAN_CACHE_LEN = 64;
// callbackType value passed to MsprofRegisterProfileCallback for the device-state callback.
constexpr int32_t kProfileDeviceStateCCallback = 16;
// Deleter functor that lets std::unique_ptr own a handle obtained from dlopen().
struct DlCloser {
    // Closes the library handle; a null handle is ignored.
    void operator()(void* handle) const noexcept
    {
        if (handle == nullptr) {
            return;
        }
        dlclose(handle);
    }
};
// Owning smart pointer for a dynamically loaded library; dlclose() runs when it is destroyed.
using LibraryHandle = std::unique_ptr<void, DlCloser>;
// Returns the single once_flag that guards the one-time process registration.
std::once_flag& GetProcessRegisteredFlag()
{
    static std::once_flag processOnce;
    return processOnce;
}
// Returns a mutable reference to the calling thread's "already registered" flag (initially false).
bool& GetThreadRegisteredRef()
{
    thread_local bool threadRegistered = false;
    return threadRegistered;
}
// Returns {thread id, process id} of the caller; `first` is the tid, `second` the pid.
std::pair<uint32_t, uint32_t> GetCurrentTidPid()
{
    const uint32_t threadId = MsUtils::GetTid();
    const uint32_t processId = static_cast<uint32_t>(getpid());
    return std::pair<uint32_t, uint32_t>(threadId, processId);
}
void EnsureProcessRegistered(uint32_t pid)
{
std::call_once(GetProcessRegisteredFlag(), [pid]() {
msServiceProfiler::DbProcessData procData;
std::string pidStr = std::to_string(pid);
procData.pid = std::move(pidStr);
procData.process_name = procData.pid;
procData.label = "";
procData.parentPid = "";
auto procExecutor = std::make_unique<
msServiceProfiler::DbExecutor<msServiceProfiler::PROCESS_INSERT_STMT>
>(std::move(procData));
msServiceProfiler::InsertExecutor2Writer<
msServiceProfiler::DBFile::SERVICE
>(std::move(procExecutor));
});
}
void EnsureThreadRegistered(uint32_t tid, uint32_t pid, const char* domain)
{
bool& registered = GetThreadRegisteredRef();
if (!registered) {
msServiceProfiler::DbThreadData threadData;
threadData.track_id = tid;
threadData.tid = std::to_string(tid);
threadData.pid = std::to_string(pid);
threadData.thread_name = (domain ? std::string(domain) : "") + "(" + std::to_string(tid) + ")";
threadData.thread_sort_index = 0;
auto threadExecutor = std::make_unique<
msServiceProfiler::DbExecutor<msServiceProfiler::THREAD_INSERT_STMT>
>(std::move(threadData));
msServiceProfiler::InsertExecutor2Writer<
msServiceProfiler::DBFile::SERVICE
>(std::move(threadExecutor));
registered = true;
}
}
void WriteSliceEvent(
const char* name,
const char* domain,
const char* msg,
uint64_t timestamp,
uint64_t duration,
uint32_t tid,
uint32_t pid
)
{
msServiceProfiler::DbSliceData sliceData;
sliceData.timestamp = timestamp;
sliceData.duration = duration;
sliceData.name = name ? name : "";
sliceData.depth = 0;
sliceData.track_id = tid;
sliceData.cat = domain ? domain : "";
sliceData.args = msg ? msg : "";
sliceData.cname = "";
sliceData.end_time = timestamp + duration;
sliceData.flag_id = "";
sliceData.pid = pid;
sliceData.tid = tid;
auto executor = std::make_unique<
msServiceProfiler::DbExecutor<msServiceProfiler::SLICE_INSERT_STMT>
>(std::move(sliceData));
msServiceProfiler::InsertExecutor2Writer<
msServiceProfiler::DBFile::SERVICE
>(std::move(executor));
}
}
// Process-wide counter; fetch_add(1) on it supplies the id of each activity marker (MarkEvent / MarkSpanAttr).
std::atomic<u_int64_t> g_markIndex(0);
// Step counter written by SetProfilerCurrentStep() and read by GetProfilerCurrentStep().
std::atomic<int> prof_current_step_num{0};
// Pointer alias for the device-state payload delivered to the set-device callbacks.
using DATA_PTR = struct ProfSetDevParaDevice *;
// Publishes the caller's current step number (relaxed atomic store).
MS_SERVICE_PROFILER_API void SetProfilerCurrentStep(int current_step_num)
{
    constexpr auto order = std::memory_order_relaxed;
    prof_current_step_num.store(current_step_num, order);
}
// Returns the most recently published step number (relaxed atomic load).
MS_SERVICE_PROFILER_API int GetProfilerCurrentStep()
{
    const int step = prof_current_step_num.load(std::memory_order_relaxed);
    return step;
}
// Payload passed to the set-device callbacks below; they reject any buffer whose length
// differs from sizeof(ProfSetDevParaDevice).
// NOTE(review): presumably this mirrors a structure defined by libprofapi — keep field order and types in sync; confirm.
struct ProfSetDevParaDevice {
uint32_t chipId;
// Device id forwarded to ServiceProfilerManager::NotifyDeviceID() when it changes.
uint32_t deviceId;
bool isOpen;
};
static uint64_t *GetSpanStartTimeCache()
{
thread_local u_int64_t cacheSpanStartTime[SPAN_CACHE_LEN + 8];
return cacheSpanStartTime;
}
// Starts a span on the calling thread and returns its handle.
// The handle is the thread's incremented span counter; the start time is stored in slot
// (handle % SPAN_CACHE_LEN) + 1, so a slot is reused after SPAN_CACHE_LEN newer spans.
// `name` is only checked for null (which delegates to StartSpan()); it is not stored.
SpanHandle StartSpanWithName(const char *name)
{
    if (name == nullptr) {
        return StartSpan();
    }
    const auto startTime = MsUtils::GetCurrentTimeInNanoseconds();
    uint64_t *slots = GetSpanStartTimeCache();
    const auto handle = ++slots[0];
    slots[handle % SPAN_CACHE_LEN + 1] = startTime;
    return handle;
}
// Starts an unnamed span; equivalent to StartSpanWithName("").
SpanHandle StartSpan()
{
    const char *unnamed = "";
    return StartSpanWithName(unnamed);
}
void MarkSpanAttr(const char *msg, SpanHandle spanHandle)
{
if (msg == nullptr) {
return;
}
thread_local uint32_t tid = MsUtils::GetTid();
uint64_t *timeCache = GetSpanStartTimeCache();
auto location = spanHandle % SPAN_CACHE_LEN + 1;
auto stratTimestamp = *(timeCache + location);
msServiceProfiler::DbActivityMarker marker;
marker.flag = msServiceProfiler::ActivityFlag::ACTIVITY_FLAG_MARKER_SPAN;
marker.timestamp = stratTimestamp;
marker.endTimestamp = MsUtils::GetCurrentTimeInNanoseconds();
marker.id = g_markIndex.fetch_add(1);
marker.processId = static_cast<uint32_t>(getpid());
marker.threadId = tid;
marker.message = msg;
auto executor =
std::make_unique<msServiceProfiler::DbExecutor<msServiceProfiler::SERVICE_INSERT_STMT>>(std::move(marker));
msServiceProfiler::InsertExecutor2Writer<msServiceProfiler::DBFile::SERVICE>(std::move(executor));
}
// Intentionally a no-op: nothing is recorded or released when a span ends here.
void EndSpan(SpanHandle)
{
}
void MarkEvent(const char *msg)
{
if (msg == nullptr) {
return;
}
thread_local uint32_t tid = MsUtils::GetTid();
msServiceProfiler::DbActivityMarker marker;
marker.flag = msServiceProfiler::ActivityFlag::ACTIVITY_FLAG_MARKER_EVENT;
marker.timestamp = MsUtils::GetCurrentTimeInNanoseconds();
marker.endTimestamp = marker.timestamp;
marker.id = g_markIndex.fetch_add(1);
marker.processId = static_cast<uint32_t>(getpid());
marker.threadId = tid;
marker.message = msg;
auto executor =
std::make_unique<msServiceProfiler::DbExecutor<msServiceProfiler::SERVICE_INSERT_STMT>>(std::move(marker));
msServiceProfiler::InsertExecutor2Writer<msServiceProfiler::DBFile::SERVICE>(std::move(executor));
}
// Closes the span identified by `spanHandle` and writes it as a slice event.
// Registers the process/thread rows on first use, then records [cached start, now).
MS_SERVICE_PROFILER_API void SpanEndEx(
    const char* name, const char* domain, const char* msg, SpanHandle spanHandle)
{
    const std::pair<uint32_t, uint32_t> ids = GetCurrentTidPid();
    const uint32_t tid = ids.first;
    const uint32_t pid = ids.second;

    EnsureProcessRegistered(pid);
    EnsureThreadRegistered(tid, pid, domain);

    const uint64_t* slots = GetSpanStartTimeCache();
    const uint64_t startTimestamp = slots[spanHandle % SPAN_CACHE_LEN + 1];
    const uint64_t endTimestamp = MsUtils::GetCurrentTimeInNanoseconds();

    WriteSliceEvent(name, domain, msg, startTimestamp, endTimestamp - startTimestamp, tid, pid);
}
// Writes an instantaneous (zero-duration) slice event stamped with the current time.
// Registers the process/thread rows on first use.
MS_SERVICE_PROFILER_API void MarkEventEx(
    const char* name, const char* domain, const char* msg)
{
    const std::pair<uint32_t, uint32_t> ids = GetCurrentTidPid();
    const uint32_t tid = ids.first;
    const uint32_t pid = ids.second;

    EnsureProcessRegistered(pid);
    EnsureThreadRegistered(tid, pid, domain);

    const uint64_t now = MsUtils::GetCurrentTimeInNanoseconds();
    WriteSliceEvent(name, domain, msg, now, 0, tid, pid);
}
// Asks the profiler manager to start profiling (forwards to NotifyStartProfiler()).
void StartServerProfiler()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.NotifyStartProfiler();
}
// Asks the profiler manager to stop profiling (forwards to NotifyStopProfiler()).
void StopServerProfiler()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.NotifyStopProfiler();
}
// Registers the function the manager calls when profiling starts.
MS_SERVICE_PROFILER_API void RegisterProfilerStartCallback(void (*callback)())
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.RegisterStartCallback(callback);
}
// Registers the function the manager calls when profiling stops.
MS_SERVICE_PROFILER_API void RegisterProfilerStopCallback(void (*callback)())
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.RegisterStopCallback(callback);
}
// Registers the function the manager calls when metric collection is switched on.
MS_SERVICE_PROFILER_API void RegisterProfilerStartMetricCallback(void (*callback)())
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.RegisterStartMetricCallback(callback);
}
// Registers the function the manager calls when metric collection is switched off.
MS_SERVICE_PROFILER_API void RegisterProfilerStopMetricCallback(void (*callback)())
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    manager.RegisterStopMetricCallback(callback);
}
// Returns the manager's IsEnable(level) verdict for the given profiling level.
bool IsEnable(uint32_t level)
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.IsEnable(level);
}
// Returns the manager's profiling output path as a C string.
// NOTE(review): the pointer is only valid if the manager's GetProfPath() returns a reference to a
// long-lived string; if it returns by value the result dangles — confirm against the header.
const char* GetProfPath()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetProfPath().c_str();
}
// Returns the manager's configured AI Core metrics value as an int.
int GetAclProfAicoreMetrics()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetAclProfAicoreMetricsValue();
}
// Returns the manager's torch-profiler "stack" setting.
bool GetTorchProfStack()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetTorchProfStack();
}
// Tells whether events of `domainName` pass the configured domain allow-list.
//
// @param domainName  NUL-terminated domain name; may be null.
// @return true when the allow-list is empty (no filtering) or contains the name;
//         false otherwise, including for a null name checked against a non-empty list.
bool IsValidDomain(const char *domainName)
{
    const std::set<std::string> &allowNames = msServiceProfiler::ServiceProfilerManager::GetInstance().GetValidDomain();
    if (allowNames.empty()) {
        return true;
    }
    // Constructing std::string from a null pointer is undefined behaviour, so reject it explicitly.
    if (domainName == nullptr) {
        return false;
    }
    return allowNames.find(std::string(domainName)) != allowNames.end();
}
// Returns the manager's torch-profiler "modules" setting.
bool GetTorchProfModules()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetTorchProfModules();
}
// Returns the manager's torch-profiler step-count setting.
int GetTorchProfStepNum()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetTorchProfStepNum();
}
// Returns the manager's ACL task-time level as a C string.
// NOTE(review): the pointer is only valid if the manager's GetAclTaskTimeLevel() returns a reference
// to a long-lived string; if it returns by value the result dangles — confirm against the header.
const char* GetAclTaskTimeLevel()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetAclTaskTimeLevel().c_str();
}
// Returns whether the torch profiler is enabled according to the manager.
bool GetTorchProfilerEnable()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetTorchProfilerEnable();
}
// Returns the manager's "domain filter enabled" setting.
bool GetEnableDomainFilter()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetEnableDomainFilter();
}
// Returns a reference to the manager's set of allowed domain names.
const std::set<std::string> &GetValidDomain()
{
    auto &manager = msServiceProfiler::ServiceProfilerManager::GetInstance();
    return manager.GetValidDomain();
}
void AddMetaInfo(const char *key, const char *value)
{
auto executor = std::make_unique<msServiceProfiler::DbExecutor<msServiceProfiler::META_INSERT_STMT>>(key, value);
msServiceProfiler::InsertExecutor2Writer<msServiceProfiler::DBFile::SERVICE>(std::move(executor));
}
// True only while the device-state callback is registered through MsprofRegisterProfileCallback
// (set by RegisterSetDeviceCallback(), cleared by UnregisterDeviceStateCallback()).
static bool g_usedNewDeviceStateApi = false;
// Device-state callback for the legacy profRegDeviceStateCallback API.
// Ignores payloads of unexpected size or null payloads; otherwise forwards the device id to the
// manager whenever it differs from the last id seen by this callback.
void MsprofSetDeviceCallbackLegacy(DATA_PTR data, uint32_t len)
{
    const bool payloadUsable = (len == sizeof(::ProfSetDevParaDevice)) && (data != nullptr);
    if (!payloadUsable) {
        return;
    }
    static uint32_t lastDeviceId = msServiceProfiler::INVALID_DEVICE_ID;
    const uint32_t reportedId = data->deviceId;
    if (reportedId == lastDeviceId) {
        return;
    }
    lastDeviceId = reportedId;
    msServiceProfiler::ServiceProfilerManager::GetInstance().NotifyDeviceID(lastDeviceId);
}
int32_t MsprofSetDeviceCallbackImpl(void *data, uint32_t len)
{
if (len != sizeof(::ProfSetDevParaDevice)) {
return -1;
}
if (data == nullptr) {
return -1;
}
DATA_PTR setCfg = static_cast<DATA_PTR>(data);
static uint32_t sdeviceID = msServiceProfiler::INVALID_DEVICE_ID;
if (setCfg->deviceId != sdeviceID) {
sdeviceID = setCfg->deviceId;
msServiceProfiler::ServiceProfilerManager::GetInstance().NotifyDeviceID(sdeviceID);
}
return 0;
}
// Clears the device-state callback previously installed through MsprofRegisterProfileCallback.
// Does nothing when that API was not used, when the library handle is null, or when the symbol
// cannot be resolved.
static void UnregisterDeviceStateCallback(void *profApiHandle)
{
    const bool nothingToUndo = !g_usedNewDeviceStateApi || profApiHandle == nullptr;
    if (nothingToUndo) {
        return;
    }
    using RegisterProfileCallbackFn = int32_t (*)(int32_t callbackType, void *callback, uint32_t len);
    void *symbol = dlsym(profApiHandle, "MsprofRegisterProfileCallback");
    const auto registerProfileCallback = reinterpret_cast<RegisterProfileCallbackFn>(symbol);
    if (registerProfileCallback == nullptr) {
        return;
    }
    // Registering a null callback removes ours; the return value is deliberately ignored.
    (void)registerProfileCallback(kProfileDeviceStateCCallback, nullptr, sizeof(void *));
    g_usedNewDeviceStateApi = false;
}
// Loads libprofapi.so and installs a device-state callback.
// The legacy profRegDeviceStateCallback entry point is preferred; MsprofRegisterProfileCallback is
// used when the legacy symbol is absent. Returns an owning handle to the library on success and an
// empty handle on any failure (the library is closed again in that case).
static LibraryHandle RegisterSetDeviceCallback()
{
    // Own the handle from the start so every early return closes the library automatically.
    LibraryHandle library(dlopen("libprofapi.so", RTLD_LAZY | RTLD_LOCAL));
    if (!library) {
        PROF_LOGW("Failed to dlopen libprofapi.so. Will be not able to get device profiling data. "
                  "Check whether a NPU server or if cann toolkit installed.");
        return LibraryHandle(nullptr);
    }

    using LegacyCallback = void (*)(DATA_PTR, uint32_t);
    using LegacyRegisterFn = int32_t (*)(LegacyCallback);
    const auto legacyRegister =
        reinterpret_cast<LegacyRegisterFn>(dlsym(library.get(), "profRegDeviceStateCallback"));
    if (legacyRegister != nullptr) {
        PROF_LOGD("Using legacy profRegDeviceStateCallback API");
        legacyRegister(MsprofSetDeviceCallbackLegacy);
        g_usedNewDeviceStateApi = false;
        return library;
    }

    using RegisterProfileCallbackFn = int32_t (*)(int32_t callbackType, void *callback, uint32_t len);
    const auto registerProfileCallback =
        reinterpret_cast<RegisterProfileCallbackFn>(dlsym(library.get(), "MsprofRegisterProfileCallback"));
    if (registerProfileCallback == nullptr) {
        PROF_LOGW("Failed to get profRegDeviceStateCallback or MsprofRegisterProfileCallback "
                  "from libprofapi.so. Will be not able to get device profiling data. "
                  "Check whether a NPU server or if cann toolkit installed.");
        return LibraryHandle(nullptr);
    }

    const int32_t status = registerProfileCallback(
        kProfileDeviceStateCCallback, reinterpret_cast<void *>(MsprofSetDeviceCallbackImpl), sizeof(void *));
    if (status != 0) {
        PROF_LOGW("MsprofRegisterProfileCallback(PROFILE_DEVICE_STATE_C_CALLBACK) failed: %d", status);
        return LibraryHandle(nullptr);
    }
    g_usedNewDeviceStateApi = true;
    return library;
}
namespace msServiceProfiler {
// Returns the process-wide manager, constructed on first use as a function-local static.
ServiceProfilerManager &ServiceProfilerManager::GetInstance()
{
    static ServiceProfilerManager instance;
    return instance;
}
// Builds the manager: initialises logging, decides master/non-master via shared memory, loads the
// configuration, optionally starts profiling, and finally launches the background control thread.
ServiceProfilerManager::ServiceProfilerManager()
: configHandle_(nullptr), config_(std::make_shared<Config>()), msptiHandle_(nullptr)
{
ProfLogInit();
// Runs before the configuration is read; it sets the profiling path date tail on config_.
MarkFirstProcessAsMain();
config_->ReadAndSaveConfig();
if (config_->GetEnable()) {
// isInit = true: StartProfiler() skips StartAclProfiler(); ThreadFunction() calls it later.
StartProfiler(true);
}
// Keep the externally requested state in sync with the actual state before the thread starts polling it.
notifyStarted = started_;
LaunchThread();
PROF_LOGD("ServiceProfilerManager Init Finished");
}
// Removes the shared-memory object derived from the config path (if any), then asks the
// background thread to leave its loop and waits for it.
ServiceProfilerManager::~ServiceProfilerManager()
{
    const std::string &configPath = GetConfigPath();
    if (!configPath.empty()) {
        const std::string shmName = ServiceProfilerManager::ToSemName(configPath);
        shm_unlink(shmName.c_str());
    }
    if (!thread_.joinable()) {
        return;
    }
    threadRunFlag_ = false;
    thread_.join();
}
// Converts a path into a POSIX shared-memory/semaphore style name: a leading '/' followed by the
// input with every '/' replaced by '#'.
std::string ServiceProfilerManager::ToSemName(const std::string &oriSemName)
{
    std::string semName = "/" + oriSemName;
    // Skip the leading slash; only separators inside the original name are rewritten.
    std::replace(semName.begin() + 1, semName.end(), '/', '#');
    return semName;
}
// Decides whether this process is the "master" among processes sharing the same config path.
//
// A shared-memory object named after the config path holds "<pid>,<date tail>" written by the
// current master. If that record names a live process, this process becomes a non-master and
// adopts the recorded date tail; otherwise it generates a date tail and writes its own record.
// Does nothing when the config path is empty or any shared-memory step fails.
// NOTE(review): isMaster_ is presumably initialised to true in the class definition — confirm.
void ServiceProfilerManager::MarkFirstProcessAsMain()
{
    const size_t mmapSize = 1024;
    const size_t infoMaxSize = 1000;
    const std::string &semNameTouchTime = config_->GetConfigPath();
    if (semNameTouchTime.empty()) {
        return;
    }
    int shmFd = shm_open(ToSemName(semNameTouchTime).c_str(), O_CREAT | O_RDWR, 0640);
    if (shmFd == -1) {
        PROF_LOGW("shm_open failed");
        return;
    }
    if (ftruncate(shmFd, mmapSize) == -1) {
        PROF_LOGW("ftruncate failed");
        close(shmFd);
        return;
    }
    void *mmapPtr = mmap(nullptr, mmapSize, PROT_READ | PROT_WRITE, MAP_SHARED, shmFd, 0);
    if (mmapPtr == MAP_FAILED) {
        PROF_LOGW("mmap failed");
        close(shmFd);
        return;
    }
    char *pInfoStr = static_cast<char *>(mmapPtr);
    // Copy a fixed-size window; the record may be shorter and NUL padded.
    std::string infoStr(pInfoStr, infoMaxSize);
    auto splitInfo = MsUtils::SplitStr(infoStr, ',');
    if (!splitInfo.second.empty()) {
        const pid_t pid = static_cast<pid_t>(MsUtils::Str2Uint(splitInfo.first));
        // kill() with pid 0 or a negative pid addresses a process group rather than one process,
        // so an unparsable or out-of-range record must not be mistaken for a live master.
        if (pid > 0 && kill(pid, 0) == 0) {
            isMaster_ = false;
            // c_str() drops any NUL padding carried over from the fixed-size window.
            config_->SetProfPathDateTail(std::string(splitInfo.second.c_str()));
        }
    }
    if (isMaster_) {
        std::string infoOut;
        config_->InitProfPathDateTail(true);
        infoOut.append(std::to_string(getpid())).append(",").append(config_->GetProfPathDateTail());
        if (sprintf_s(pInfoStr, infoMaxSize, "%s", infoOut.c_str()) == -1) {
            PROF_LOGW("cannot write to mmap");
        }
    }
    if (munmap(mmapPtr, mmapSize) == -1) {
        PROF_LOGW("munmap failed");
    }
    close(shmFd);
}
void ServiceProfilerManager::LaunchThread()
{
this->thread_ = std::thread(&ServiceProfilerManager::ThreadFunction, this);
}
// Re-reads the config file when its modification time has changed and applies the resulting
// transitions: profiler on/off, and metric collection on/off via the registered metric callbacks.
// Called from the background thread loop on every iteration.
void ServiceProfilerManager::DynamicControl()
{
auto configPath = config_->GetConfigPath();
if (configPath.empty()) {
return;
}
struct stat configFileStat;
if (stat(configPath.c_str(), &configFileStat) == 0) {
// Unchanged mtime means there is nothing new to apply.
if (configFileStat.st_mtime == lastUpdate_) {
return;
} else {
lastUpdate_ = configFileStat.st_mtime;
}
} else {
LOG_ONCE_E("fail to get stat of %s", SecurityUtils::ToSafeString(configPath).c_str());
return;
}
auto configJson = config_->ReadConfigFile();
// NOTE(review): the boolean second argument of ParseEnable/ParseMetricEnable presumably selects a
// "peek without committing" mode when true — confirm against Config.
bool enableFromConfig = config_->ParseEnable(configJson, true);
// Captured before ParseConfig() below can change the stored metric setting.
bool prevMetricEnable = config_->GetMetricEnable();
if (enableFromConfig && !config_->GetEnable()) {
PROF_LOGI("Profiler Enabled...");
config_->ParseConfig(configJson);
StartProfiler();
PROF_LOGI("Profiler Enabled Successfully!");
} else if (!enableFromConfig && config_->GetEnable()) {
PROF_LOGI("Profiler Disabled...");
StopProfiler();
PROF_LOGI("Profiler Disabled Successfully!");
}
bool metricEnableFromConfig = config_->ParseMetricEnable(configJson, true);
// Start when the file enables metrics and either the setting changed or the start callback has not run yet.
bool needStart = metricEnableFromConfig && (prevMetricEnable != metricEnableFromConfig || !metricStartCallbackInvoked_);
// Stop when the file disables metrics and either the setting changed or a start callback is still in effect.
bool needStop = !metricEnableFromConfig && (prevMetricEnable != metricEnableFromConfig || metricStartCallbackInvoked_);
if (needStart) {
config_->ParseMetricEnable(configJson, false);
if (startMetricCallback_ != nullptr) {
PROF_LOGI("Metric collection enabled, calling start metric callback");
try {
startMetricCallback_();
// Only recorded when the callback returned normally.
metricStartCallbackInvoked_ = true;
} catch (...) {
PROF_LOGE("Python start metric callback threw an exception");
}
}
} else if (needStop) {
config_->ParseMetricEnable(configJson, false);
if (stopMetricCallback_ != nullptr) {
PROF_LOGI("Metric collection disabled, calling stop metric callback");
try {
stopMetricCallback_();
// Only cleared when the callback returned normally.
metricStartCallbackInvoked_ = false;
} catch (...) {
PROF_LOGE("Python stop metric callback threw an exception");
}
}
}
}
// Body of the background control thread.
//
// Registers the device-state callback, starts device-side profiling if the profiler is already
// enabled, records host metadata, and then loops until threadRunFlag_ is cleared. Each iteration
// sleeps for the configured interval, applies config-file changes, reconciles start/stop requests,
// restarts ACL profiling when the device id changes, enforces time/step limits, samples NPU memory
// and flushes MSPTI buffers. On exit it unregisters the callback and stops the profiler.
void ServiceProfilerManager::ThreadFunction()
{
    PROF_LOGD("profiler thread launched");
    auto profApiHandle = RegisterSetDeviceCallback();
    uint32_t deviceID = deviceID_.load();
    PROF_LOGD("start prof device id is %u", deviceID);
    if (config_->GetEnable()) {
        StartAclProfiler(config_->GetProfPath(), deviceID);
    }
    NpuMemoryUsage npuMemoryUsage = NpuMemoryUsage();
    AddMetaInfo("hostname", MsUtils::GetHostName().c_str());
    AddMetaInfo("ppid", std::to_string(getppid()).c_str());

    constexpr uint64_t heartbeatPeriodMs = 60000;
    uint64_t heartbeat = 0;
    while (threadRunFlag_) {
        const auto sleepMs = config_->GetNpuMemorySleepMilliseconds();
        // A sleep interval of 0 or above one minute would make the divisor (or the modulus) zero,
        // so clamp both to at least 1 before computing how many iterations make up a minute.
        const uint64_t safeSleepMs = (sleepMs > 0) ? static_cast<uint64_t>(sleepMs) : 1;
        const uint64_t iterationsPerHeartbeat = std::max<uint64_t>(1, heartbeatPeriodMs / safeSleepMs);
        if (heartbeat++ % iterationsPerHeartbeat == 0) {
            PROF_LOGD("manager thread heartbeat");
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(sleepMs));
        DynamicControl();

        // Reconcile the externally requested state with the actual state.
        bool startFlagFromNotify = notifyStarted.load();
        if (startFlagFromNotify != started_) {
            if (startFlagFromNotify) {
                StartProfiler();
                deviceID = deviceID_.load();
            } else {
                StopProfiler();
            }
        }

        // Restart ACL profiling when the device reported by the callback has changed.
        uint32_t nowDeviceID = deviceID_.load();
        if (nowDeviceID != deviceID && started_ && config_->IsAclProf()) {
            StopAclProf();
            StartAclProf(config_->GetProfPath(), nowDeviceID);
        }
        deviceID = nowDeviceID;

        ProfTimerCtrl();
        ProfStepCtrl();
        RecordMemoryUsage(npuMemoryUsage);
        if (msptiStarted_) {
            FlushBufferByTime();
        }
    }
    UnregisterDeviceStateCallback(profApiHandle.get());
    PROF_LOGD("profiler thread stop loop");
    StopProfiler();
}
// Stops the profiler once the configured number of steps has elapsed since profiling started.
//
// Inactive when the profiler is not running, when it was already stopped by this limit, or when
// the configured step limit is negative. On the first active call the target step is fixed at
// (current step + limit); when the current step reaches it the profiler is stopped, the file
// enable flag is cleared and the limit is marked as consumed.
// NOTE(review): a limit of 0 stops the profiler on the first check — confirm that is intended.
void ServiceProfilerManager::ProfStepCtrl()
{
    const int stepLimit = config_->GetProfilerStepNum();
    if (profilerStoppedByLimit_) {
        return;
    }
    if (!started_ || stepLimit < 0) {
        return;
    }
    // From here on stepLimit is non-negative, so no "limit disabled" branch is needed.
    const int currentStep = GetProfilerCurrentStep();
    if (stopTargetStep_ < 0) {
        stopTargetStep_ = currentStep + stepLimit;
    }
    if (currentStep < stopTargetStep_) {
        return;
    }
    PROF_LOGI("Profiler Step Limit Reached! Current=%d, Target=%d. Stopping...",
              currentStep, stopTargetStep_);
    StopProfiler();
    stopTargetStep_ = -1;
    PROF_LOGI("Profiler Disabled Successfully!");
    config_->SetFileEnable(0);
    profilerStoppedByLimit_ = true;
}
void ServiceProfilerManager::ProfTimerCtrl()
{
{
if (config_->GetTimeLimit() > 0 && started_) {
auto terminate = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(terminate - initiate);
if (duration.count() >= config_->GetTimeLimit()) {
StopProfiler();
PROF_LOGI("Profiler Timelimit %u Seconds Is Reached,"
" Profiler Disabled Successfully!",
config_->GetTimeLimit());
config_->SetFileEnable(0);
}
}
if (config_->GetAclTaskTimeDuration() > 0 && aclProfStarted_) {
auto terminate = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(terminate - initiate);
if (duration.count() >= config_->GetAclTaskTimeDuration()) {
StopAclProf();
PROF_LOGI("Profiler AclTaskTimeDuration %d Seconds Is Reached, "
"AclTaskTime Disabled Successfully!",
config_->GetAclTaskTimeDuration());
config_->SetAclTaskTimeDuration(0);
}
}
}
}
// Emits one "npu"-domain metric per entry of `memoryInfo`, scoped to the entry's index as "device".
void DeviceMemoryWrite2Tx(const std::vector<int> &memoryInfo, const std::string& metricName)
{
    const char *metric = metricName.c_str();
    long unsigned int deviceIndex = 0;
    for (const int &value : memoryInfo) {
        msServiceProfiler::Profiler<msServiceProfiler::INFO>()
            .Domain("npu")
            .Metric(metric, value)
            .MetricScope("device", deviceIndex)
            .Launch();
        ++deviceIndex;
    }
}
// Samples NPU memory usage/utilisation through DCMI and emits them as metrics.
// Only runs on the master process while profiling and NPU memory sampling are both enabled;
// standard exceptions are swallowed with a debug log.
void ServiceProfilerManager::RecordMemoryUsage(NpuMemoryUsage &npuMemoryUsage)
{
    try {
        if (!config_->GetEnable() || !config_->GetNpuMemoryUsage() || !isMaster_) {
            return;
        }
        if (npuMemoryUsage.InitDcmiCardAndDevices() != EXITCODE_SUCCESS) {
            PROF_LOGE("InitDcmiCardAndDevices failed."
                      " Check whether a NPU server or if NPU driver installed.");
            return;
        }
        std::vector<int> usedPerDevice;
        std::vector<int> utilizationPerDevice;
        if (npuMemoryUsage.GetByDcmi(usedPerDevice, utilizationPerDevice) != EXITCODE_SUCCESS) {
            return;
        }
        DeviceMemoryWrite2Tx(usedPerDevice, "usage");
        DeviceMemoryWrite2Tx(utilizationPerDevice, "utiliza");
    } catch (const std::exception &) {
        PROF_LOGD("get npu memory usage failed");
    }
}
// Pushes the host-side sampling options to ACL profiling: the selected host items
// ("cpu", "mem", "cpu,mem" or empty) and the sampling frequency.
void ServiceProfilerManager::SetAclProfHostSysConfig() const
{
    const bool wantCpu = config_->GetHostCpuUsage();
    const bool wantMem = config_->GetHostMemoryUsage();

    std::string hostItems;
    if (wantCpu) {
        hostItems = wantMem ? "cpu,mem" : "cpu";
    } else if (wantMem) {
        hostItems = "mem";
    }
    const std::string hostFreq = std::to_string(config_->GetHostFreq());

    aclprofSetConfig(ACL_PROF_HOST_SYS, hostItems.c_str(), hostItems.size());
    aclprofSetConfig(ACL_PROF_HOST_SYS_USAGE, hostItems.c_str(), hostItems.size());
    aclprofSetConfig(ACL_PROF_HOST_SYS_USAGE_FREQ, hostFreq.c_str(), hostFreq.size());
}
// Creates the ACL profiling config for `deviceID` and remembers it in configHandle_.
// With INVALID_DEVICE_ID the config covers no device and only the MSPROFTX switch is used;
// with a real device the configured profiling switch is used when ACL task time is enabled.
// Returns the new config, or nullptr (after logging) when creation fails.
AclprofConfig *ServiceProfilerManager::ProfCreateConfig(uint32_t deviceID)
{
    uint32_t deviceIdList[MAX_DEVICE_NUM] = {0};
    uint32_t deviceNums = 0;
    uint32_t profSwitch = ACL_PROF_MSPROFTX;
    if (deviceID != INVALID_DEVICE_ID) {
        deviceNums = 1;
        deviceIdList[0] = deviceID;
        if (config_->GetEnableAclTaskTime()) {
            profSwitch = config_->GetProfilingSwitch();
        }
    }
    aclprofAicoreMetrics aicoreMetricsEnum = config_->GetAclProfAicoreMetrics();

    PROF_LOGD("Current profSwitch configuration: Hex: 0x%x", profSwitch);
    PROF_LOGD("Current aicoreMetricsEnum configuration: %u", aicoreMetricsEnum);
    PROF_LOGD("Current deviceID configuration: %u, %u", deviceNums, deviceIdList[0]);

    auto createdConfig = aclprofCreateConfig(deviceIdList, deviceNums, aicoreMetricsEnum, nullptr, profSwitch);
    if (createdConfig != nullptr) {
        configHandle_ = createdConfig;
    } else {
        PROF_LOGE("acl prof create config failed.");
    }
    return createdConfig;
}
// Starts profiling if it is not already running.
// Resets the step/time limit bookkeeping, creates the output directory, queues the database
// "start dump" task, optionally starts device-side profiling, marks the profiler as started and
// finally invokes the registered start callback.
// isInit: when true (constructor path) StartAclProfiler() is skipped here; ThreadFunction() calls
// it once the background thread is running.
void ServiceProfilerManager::StartProfiler(bool isInit)
{
if (started_) {
return;
}
stopTargetStep_ = -1;
profilerStoppedByLimit_ = false;
// Reference point for the time limits checked in ProfTimerCtrl().
initiate = std::chrono::high_resolution_clock::now();
auto profPath = config_->GetProfPath();
if (!MsUtils::MakeDirs(profPath)) {
PROF_LOGE(
"Failed to create directory(%s), possibly due to lack of permission", profPath.c_str());
config_->SetEnable(false);
return;
}
PROF_LOGI("prof path: %s", profPath.c_str());
// profPath is captured by value because the task runs later on the writer side.
auto executor = std::make_unique<DbFuncExec>(
[profPath](ServiceProfilerDbWriter &writer, sqlite3 *) -> void { writer.StartDump(profPath); }, PRIORITY_START_PROF);
msServiceProfiler::InsertExecutor2Writer<DBFile::SERVICE>(std::move(executor));
if (!isInit) {
StartAclProfiler(profPath, deviceID_.load());
}
config_->SetEnable(true);
started_ = true;
notifyStarted = true;
if (startCallback_ != nullptr) {
PROF_LOGD("Calling Python start callback");
try {
startCallback_();
} catch (...) {
PROF_LOGE("Python start callback threw an exception");
}
}
}
// Starts the device-side backend selected by the configuration, in priority order:
// MSPTI, then torch profiler (nothing to start here), then ACL profiling.
void ServiceProfilerManager::StartAclProfiler(const std::string &profPath, uint32_t deviceID)
{
    if (config_->GetMsptiEnable()) {
        StartMsptiProf(profPath);
        return;
    }
    if (config_->GetTorchProfilerEnable()) {
        // The torch profiler is not started from this function.
        return;
    }
    if (config_->IsAclProf()) {
        StartAclProf(profPath, deviceID);
    }
}
// Initialises MSPTI for `profPath`; on success configures its activities and API/kernel filters
// and marks MSPTI as started, on failure logs and leaves it marked as not started.
void ServiceProfilerManager::StartMsptiProf(const std::string &profPath)
{
    if (InitMspti(profPath, msptiHandle_) != 0) {
        PROF_LOGE("Mspti init failed.");
        msptiStarted_ = false;
        return;
    }
    InitMsptiActivity(config_->GetMsptiEnable());
    const auto apiFilter = config_->GetApiFilter();
    const auto kernelFilter = config_->GetKernelFilter();
    InitMsptiFilter(apiFilter, kernelFilter);
    msptiStarted_ = true;
}
// Starts ACL profiling for `deviceID`, writing to `profPath`.
// Sequence: aclprofInit -> (master only) host system config -> create config -> aclprofStart.
// Does nothing when ACL profiling is already running, or when there is no valid device and this
// process is not a master that needs host CPU/memory sampling.
void ServiceProfilerManager::StartAclProf(const std::string &profPath, uint32_t deviceID)
{
if (aclProfStarted_) {
PROF_LOGD("StartAclProf aclProf is Started: %d", aclProfStarted_);
return;
}
PROF_LOGD("StartAclProf device_id: %u, is Master: %d", deviceID, isMaster_);
// Without a device, only the master with host sampling enabled has anything to profile.
if (deviceID == INVALID_DEVICE_ID &&
!(isMaster_ && (config_->GetHostCpuUsage() || config_->GetHostMemoryUsage()))) {
return;
}
PROF_LOGD("StartAclProf starting");
aclError ret = aclprofInit(profPath.c_str(), profPath.size());
if (ret != ACL_ERROR_NONE) {
PROF_LOGE("acl prof init failed, ret = %d", ret);
return;
}
// NOTE(review): FailAutoFree presumably runs the registered functions when it goes out of scope
// unless SetSuccess() was called — confirm against MsUtils::FailAutoFree.
MsUtils::FailAutoFree autoFree;
autoFree.AddFreeFunction([]() {
if (aclprofFinalize() != ACL_ERROR_NONE) {
PROF_LOGE("acl prof finalize failed");
}
},
"auto call finalize after acl prof init when start failed.");
// ret is always ACL_ERROR_NONE here because the failure case returned above.
if (ret == ACL_ERROR_NONE && isMaster_) {
SetAclProfHostSysConfig();
}
auto profConfig = ProfCreateConfig(deviceID);
if (profConfig == nullptr) {
config_->SetEnable(false);
return;
}
autoFree.AddFreeFunction([this, profConfig]() {
if (aclprofDestroyConfig(profConfig) != ACL_ERROR_NONE) {
PROF_LOGE("acl prof destroy config failed");
}
configHandle_ = nullptr;
},
"auto call destroy after acl prof create config when start failed.");
PROF_LOGD("begin to start profiling");
ret = aclprofStart(profConfig);
if (ret != ACL_ERROR_NONE) {
PROF_LOGE("acl prof start failed, ret = %d", ret);
config_->SetEnable(false);
return;
}
autoFree.SetSuccess();
aclProfStarted_ = true;
}
void ServiceProfilerManager::StopAclProf()
{
if (!aclProfStarted_) {
return;
}
auto profConfig = (AclprofConfig *)this->configHandle_;
PROF_LOGD("StopAclProf calling aclprofStop");
auto ret = aclprofStop(profConfig);
aclProfStarted_ = false;
if (ret != ACL_ERROR_NONE) {
PROF_LOGE("acl prof stop failed, ret = %d", ret);
return;
}
ret = aclprofDestroyConfig(profConfig);
if (ret != ACL_ERROR_NONE) {
PROF_LOGE("acl prof destroy config failed, ret = %d", ret);
}
this->configHandle_ = nullptr;
ret = aclprofFinalize();
if (ret != ACL_ERROR_NONE) {
PROF_LOGE("acl prof finalize failed, ret = %d", ret);
return;
}
}
// Records the device id reported by the device-state callback; the background thread reads it.
void ServiceProfilerManager::NotifyDeviceID(uint32_t deviceID)
{
    PROF_LOGD("device id set to %u", deviceID);
    deviceID_.store(deviceID);
}
// Stores the start callback. If profiling is already running and the callback is non-null it is
// invoked immediately; exceptions thrown by it are logged and swallowed.
void ServiceProfilerManager::RegisterStartCallback(void (*callback)())
{
    startCallback_ = callback;
    PROF_LOGD("Profiler start callback registered");

    const bool invokeNow = started_ && callback != nullptr;
    if (!invokeNow) {
        return;
    }
    PROF_LOGI("Profiler already started, calling start callback immediately");
    try {
        callback();
    } catch (...) {
        PROF_LOGE("Start callback threw an exception");
    }
}
void ServiceProfilerManager::RegisterStopCallback(void (*callback)())
{
stopCallback_ = callback;
PROF_LOGD("Profiler stop callback registered");
}
void ServiceProfilerManager::RegisterStartMetricCallback(void (*callback)())
{
startMetricCallback_ = callback;
PROF_LOGD("Profiler start metric callback registered");
}
void ServiceProfilerManager::RegisterStopMetricCallback(void (*callback)())
{
stopMetricCallback_ = callback;
PROF_LOGD("Profiler stop metric callback registered");
}
// Stops profiling if it is running.
// Disables the config flag, shuts down whichever device-side backend is active (MSPTI takes
// precedence over ACL profiling), queues the database "stop dump" task, clears the started flags
// and finally invokes the registered stop callback.
void ServiceProfilerManager::StopProfiler()
{
PROF_LOGD("StopProfiler started_=%d, aclProfStarted_=%d", started_, aclProfStarted_);
if (!started_) {
return;
}
config_->SetEnable(false);
if (msptiStarted_) {
// Clear the flag first so the thread loop stops flushing MSPTI buffers.
msptiStarted_ = false;
UninitMspti(msptiHandle_);
} else if (aclProfStarted_) {
StopAclProf();
} else {
}
auto executor =
std::make_unique<DbFuncExec>([](ServiceProfilerDbWriter &writer, sqlite3 *) -> void { writer.StopDump(); }, PRIORITY_STOP_PROF);
msServiceProfiler::InsertExecutor2Writer<DBFile::SERVICE>(std::move(executor));
started_ = false;
notifyStarted = false;
if (stopCallback_ != nullptr) {
PROF_LOGD("Calling Python stop callback");
try {
stopCallback_();
} catch (...) {
PROF_LOGE("Python stop callback threw an exception");
}
}
}
}