* -------------------------------------------------------------------------
* This file is part of the MindStudio project.
* Copyright (c) 2025 Huawei Technologies Co.,Ltd.
*
* MindStudio is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*/
#include <cstring>
#include <thread>
#include <sys/wait.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <chrono>
#include <sys/file.h>
#include "include/Macro.h"
#include "utils/FileUtils.h"
#include "utils/FileOperation.h"
#include "utils/DataUtils.h"
#include "utils/MathUtils.h"
#include "core/AclTensor.h"
#include "base/ErrorInfosManager.h"
#include "third_party/proto/AclDumpMsg.pb.h"
#include "AclDumpDataProcessor.h"
namespace MindStudioDebugger {
namespace AclDumpMsg = toolkit::dumpdata;
constexpr size_t DHA_ATOMIC_ADD_INFO_SIZE = 128;
constexpr size_t L2_ATOMIC_ADD_INFO_SIZE = 128;
constexpr size_t AICORE_INFO_SIZE = 256;
constexpr size_t DHA_ATOMIC_ADD_STATUS_SIZE = 256;
constexpr size_t L2_ATOMIC_ADD_STATUS_SIZE = 256;
constexpr size_t UINT64_SIZE = sizeof(uint64_t);
constexpr const char* DEBUG_FILE_SIGN = "Opdebug.Node_OpDebug.";
constexpr const char* STATS_HEADER_INOUT = "Input/Output";
constexpr const char* STATS_HEADER_ID = "Index";
constexpr const char* STATS_HEADER_DATA_SIZE = "Data Size";
constexpr const char* STATS_HEADER_DATA_TYPE = "Data Type";
constexpr const char* STATS_HEADER_FORMAT = "Format";
constexpr const char* STATS_HEADER_SHAPE = "Shape";
constexpr const char* STATS_HEADER_MAX = "Max Value";
constexpr const char* STATS_HEADER_MIN = "Min Value";
constexpr const char* STATS_HEADER_AVG = "Avg Value";
constexpr const char* STATS_HEADER_L2NORM = "l2norm";
constexpr const char* STATS_CSV_HEADER_L2NORM = "L2Norm Value";
constexpr const char* STATS_HEADER_MD5 = "MD5 Value";
constexpr const char* STATS_HEADER_NAN = "Nan Count";
constexpr const char* STATS_CSV_HEADER_NAN = "NaN Count";
constexpr const char* STATS_HEADER_NEG_INF = "Negative Inf Count";
constexpr const char* STATS_HEADER_POS_INF = "Positive Inf Count";
constexpr const char* RANK_ID = "RANK_ID";
constexpr const char* DIGITAL_NUMBERS = "0123456789";
static const std::map<DebuggerSummaryOption, std::pair<std::string, std::string>> SUMMARY_OPTION_HEADER_STR_MAP = {
{DebuggerSummaryOption::MAX, {STATS_HEADER_MAX, STATS_HEADER_MAX}},
{DebuggerSummaryOption::MIN, {STATS_HEADER_MIN, STATS_HEADER_MIN}},
{DebuggerSummaryOption::MEAN, {STATS_HEADER_AVG, STATS_HEADER_AVG}},
{DebuggerSummaryOption::L2NORM, {STATS_HEADER_L2NORM, STATS_CSV_HEADER_L2NORM}},
{DebuggerSummaryOption::NAN_CNT, {STATS_HEADER_NAN, STATS_CSV_HEADER_NAN}},
{DebuggerSummaryOption::NEG_INF_CNT, {STATS_HEADER_NEG_INF, STATS_HEADER_NEG_INF}},
{DebuggerSummaryOption::POS_INF_CNT, {STATS_HEADER_POS_INF, STATS_HEADER_POS_INF}},
{DebuggerSummaryOption::MD5, {STATS_HEADER_MD5, STATS_HEADER_MD5}},
};
const static std::map<AclDtype, AclDtype> kDtypeTransMap = {
{AclDtype::DT_BF16, AclDtype::DT_FLOAT},
{AclDtype::DT_INT4, AclDtype::DT_INT8},
};
class AclTensorStats {
public:
AclTensorStats() = default;
explicit AclTensorStats(const AclTensorInfo& tensor, const std::map<DebuggerSummaryOption, std::string>& summary);
~AclTensorStats() = default;
std::string GetCsvHeader() const;
std::string GetCsvValue() const;
std::string GetPath() const {return path;}
bool Empty() const {return stats.empty();};
static AclTensorStats CalTensorSummary(const AclTensorInfo& tensor, const std::vector<DebuggerSummaryOption>& opt);
static AclTensorStats ParseTensorSummary(const std::string& dumpPath, const std::string& input);
private:
std::string path;
std::string opType;
std::string opName;
std::string taskID;
std::string streamID;
std::string timestamp;
std::string inout;
std::string slot;
std::string dataSize;
std::string dataType;
std::string format;
std::string shape;
std::map<DebuggerSummaryOption, std::string> stats;
void ParseInfoFromDumpPath(const std::string& dumpPath);
std::string& operator[](DebuggerSummaryOption opt) { return stats[opt]; }
static constexpr const size_t BUFFER_LEN = 1024;
};
void AclTensorStats::ParseInfoFromDumpPath(const std::string& dumpPath)
{
std::string filename;
if (FileUtils::GetFileSuffix(dumpPath) == "csv") {
filename = FileUtils::GetFileBaseName(dumpPath);
} else {
filename = FileUtils::GetFileName(dumpPath);
}
path = FileUtils::GetParentDir(dumpPath);
std::vector<std::string> tokens = FileUtils::SplitPath(filename, '.');
if (tokens.size() < 5) {
LOG_WARNING(DebuggerErrno::ERROR_INVALID_FORMAT, "Skip dumping invalid op " + filename);
stats.clear();
return;
}
opType = std::move(tokens[0]);
opName = std::move(tokens[1]);
taskID = std::move(tokens[2]);
streamID = std::move(tokens[3]);
timestamp = std::move(tokens[4]);
}
AclTensorStats::AclTensorStats(const AclTensorInfo& tensor, const std::map<DebuggerSummaryOption, std::string>& summary)
: stats{summary}
{
ParseInfoFromDumpPath(tensor.dumpPath);
if (stats.empty()) {
return;
}
inout = tensor.inout;
slot = std::to_string(tensor.slot);
dataSize = std::to_string(tensor.dataSize);
dataType = DataUtils::GetDTypeString(tensor.dtype);
format = DataUtils::GetFormatString(tensor.hostFmt);
shape = DataUtils::GetShapeString(tensor.hostShape);
}
AclTensorStats AclTensorStats::CalTensorSummary(const AclTensorInfo& tensor,
const std::vector<DebuggerSummaryOption>& opt)
{
DEBUG_FUNC_TRACE();
std::map<DebuggerSummaryOption, std::string> summary;
if (ELE_IN_VECTOR(opt, DebuggerSummaryOption::MD5)) {
const uint8_t* data = tensor.transBuf.empty() ? tensor.aclData : tensor.transBuf.data();
summary[DebuggerSummaryOption::MD5] = MathUtils::CalculateMD5(data, tensor.dataSize);
}
return AclTensorStats(tensor, summary);
}
static std::map<uint32_t, DebuggerSummaryOption> ParseTensorSummaryHeaderOrder(const std::vector<std::string>& segs)
{
std::map<uint32_t, DebuggerSummaryOption> ret;
for (size_t pos = 0; pos < segs.size(); ++pos) {
const std::string& opt = segs[pos];
for (auto it = SUMMARY_OPTION_HEADER_STR_MAP.begin(); it != SUMMARY_OPTION_HEADER_STR_MAP.end(); ++it) {
if (opt == it->second.first) {
ret[pos] = it->first;
break;
}
}
}
return ret;
}
AclTensorStats AclTensorStats::ParseTensorSummary(const std::string& dumpPath, const std::string& input)
{
constexpr const size_t optPosBase = 7;
static std::map<uint32_t, DebuggerSummaryOption> order;
static uint32_t headerLen = 0;
std::vector<std::string> segs = FileUtils::SplitPath(input, ',');
if (order.empty()) {
if (segs.size() <= optPosBase || segs[0] != STATS_HEADER_INOUT) {
LOG_WARNING(DebuggerErrno::ERROR_INVALID_FORMAT, "Summary data miss header, some data may lose.");
return AclTensorStats();
}
headerLen = segs.size();
order = ParseTensorSummaryHeaderOrder(segs);
return AclTensorStats();
}
if (segs.size() < headerLen) {
LOG_WARNING(DebuggerErrno::ERROR_INVALID_FORMAT, "Summary data miss some fields, some data may lose.");
return AclTensorStats();
}
if (segs[0] == STATS_HEADER_INOUT) {
return AclTensorStats();
}
AclTensorStats stat = AclTensorStats();
stat.ParseInfoFromDumpPath(dumpPath);
stat.inout = segs[0];
stat.slot = segs[1];
stat.dataSize = segs[2];
stat.dataType = segs[3];
stat.format = segs[4];
stat.shape = segs[5];
for (auto it = order.begin(); it != order.end(); ++it) {
stat[it->second] = segs[it->first];
}
return stat;
}
std::string AclTensorStats::GetCsvHeader() const
{
if (stats.empty()) {
return std::string();
}
std::string ret;
ret.reserve(BUFFER_LEN);
ret.append("Op Type,Op Name,Task ID,Stream ID,Timestamp,Input/Output,Slot,Data Size,Data Type,Format,Shape");
for (auto it = stats.begin(); it != stats.end(); it++) {
ret.append(",");
ret.append(SUMMARY_OPTION_HEADER_STR_MAP.at(it->first).second);
}
ret.append("\n");
return ret;
}
std::string AclTensorStats::GetCsvValue() const
{
if (stats.empty()) {
return std::string();
}
std::string ret;
ret.reserve(BUFFER_LEN);
ret.append(opType).append(",").append(opName).append(",").append(taskID).append(",").append(streamID).append(",") \
.append(timestamp).append(",").append(inout).append(",").append(slot).append(",") .append(dataSize) \
.append(",").append(dataType).append(",").append(format).append(",").append(shape);
for (auto it = stats.begin(); it != stats.end(); it++) {
ret.append(",");
ret.append(it->second);
}
ret.append("\n");
return ret;
}
AclDumpDataProcessor::~AclDumpDataProcessor()
{
while (!buffer.empty()) {
delete buffer.front();
buffer.pop();
}
}
std::string AclDumpDataProcessor::ToString() const
{
return "AclDumpDataProcessor(path=" + dumpPath + ",completed=" + std::to_string(completed) + ",len=" +
std::to_string(totalLen) + ")";
}
DebuggerErrno AclDumpDataProcessor::PushData(const AclDumpChunk *chunk)
{
DEBUG_FUNC_TRACE();
if (completed) {
LOG_WARNING(DebuggerErrno::ERROR_INVALID_OPERATION,
ToString() + " receive data when completed. Some errors may occur.");
return DebuggerErrno::ERROR_INVALID_OPERATION;
}
if (chunk->isLastChunk) {
completed = true;
}
size_t len = chunk->bufLen;
if (len == 0) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_VALUE, ToString() + ": invalid value(cached size " +
std::to_string(totalLen) + ", receiving size " + std::to_string(len) + ").");
errorOccurred = true;
return DebuggerErrno::ERROR_INVALID_VALUE;
}
if (SIZE_MAX - len < totalLen || totalLen + len > MAX_DATA_LEN) {
LOG_ERROR(DebuggerErrno::ERROR_BUFFER_OVERFLOW, ToString() + ": buffer overflow(cached size " +
std::to_string(totalLen) + ", receiving size " + std::to_string(len) + ").");
errorOccurred = true;
return DebuggerErrno::ERROR_BUFFER_OVERFLOW;
}
std::vector<uint8_t> *p = new (std::nothrow) std::vector<uint8_t>(len);
if (p == nullptr) {
LOG_ERROR(DebuggerErrno::ERROR_NO_MEMORY, "Acl dump data processor(" + dumpPath + "): Alloc failed(" +
std::to_string(len) + " bytes).");
errorOccurred = true;
return DebuggerErrno::ERROR_NO_MEMORY;
}
try {
std::copy(chunk->dataBuf, chunk->dataBuf + len, p->begin());
} catch (const std::exception& e) {
LOG_ERROR(DebuggerErrno::ERROR_SYSCALL_FAILED, ToString() + ": Failed to copy data;");
delete p;
errorOccurred = true;
return DebuggerErrno::ERROR_SYSCALL_FAILED;
}
buffer.push(p);
totalLen += len;
if (!chunk->isLastChunk) { return DebuggerErrno::OK; }
completed = true;
DebuggerErrno ret = ConcatenateData();
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, "Acl dump data processor(" + dumpPath + "): Failed to concatenate data.");
errorOccurred = true;
return ret;
}
LOG_DEBUG(ToString() + " is completed.");
return DebuggerErrno::OK;
}
DebuggerErrno AclDumpDataProcessor::ConcatenateData()
{
DEBUG_FUNC_TRACE();
if (!completed) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_OPERATION, "Data processor(" + dumpPath + "): Data is incomplete.");
return DebuggerErrno::ERROR_INVALID_OPERATION;
}
if (buffer.empty()) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_VALUE, "Data processor(" + dumpPath + "): No data.");
return DebuggerErrno::ERROR_INVALID_VALUE;
}
if (buffer.size() > 1) {
std::vector<uint8_t> *p = new (std::nothrow) std::vector<uint8_t>(totalLen);
if (p == nullptr) {
LOG_ERROR(DebuggerErrno::ERROR_NO_MEMORY, "Alloc failed(" + std::to_string(totalLen) + ").");
return DebuggerErrno::ERROR_NO_MEMORY;
}
size_t offset = 0;
while (!buffer.empty()) {
try {
std::copy(buffer.front()->begin(), buffer.front()->end(), p->begin() + offset);
} catch (const std::exception& e) {
delete p;
LOG_ERROR(DebuggerErrno::ERROR_SYSCALL_FAILED, "Data processor(" + dumpPath + "): Failed to copy.");
return DebuggerErrno::ERROR_SYSCALL_FAILED;
}
offset += buffer.front()->size();
delete buffer.front();
buffer.pop();
}
buffer.push(p);
}
if (FileUtils::GetFileSuffix(dumpPath) == CSV_SUFFIX) {
dataSegOffset = 0;
dataSegLen = totalLen;
return DebuggerErrno::OK;
}
headerSegOffset = sizeof(uint64_t);
if (totalLen < headerSegOffset) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_FORMAT, "Acl dump data processor(" + dumpPath +
"): Invalid data length " + std::to_string(totalLen) + ".");
return DebuggerErrno::ERROR_INVALID_FORMAT;
}
headerSegLen = *(reinterpret_cast<const uint64_t *>(buffer.front()->data()));
if (totalLen < headerSegOffset + headerSegLen) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_FORMAT, "Acl dump data processor(" + dumpPath +
"): Invalid header len " + std::to_string(headerSegLen) + "/" + std::to_string(totalLen) + ".");
return DebuggerErrno::ERROR_INVALID_FORMAT;
}
dataSegOffset = headerSegOffset + headerSegLen;
dataSegLen = totalLen - dataSegOffset;
return DebuggerErrno::OK;
}
static nlohmann::json ParseOverflowInfo(const uint8_t* data)
{
DEBUG_FUNC_TRACE();
uint32_t index = 0;
nlohmann::json overflowInfo;
uint64_t modelId = DataUtils::UnpackUint64ValueLe(data);
index += UINT64_SIZE;
uint64_t streamId = DataUtils::UnpackUint64ValueLe(data + index);
index += UINT64_SIZE;
uint64_t taskId = DataUtils::UnpackUint64ValueLe(data + index);
index += UINT64_SIZE;
uint64_t taskType = DataUtils::UnpackUint64ValueLe(data + index);
index += UINT64_SIZE;
uint64_t pcStart = DataUtils::UnpackUint64ValueLe(data + index);
index += UINT64_SIZE;
uint64_t paraBase = DataUtils::UnpackUint64ValueLe(data + index);
overflowInfo["model_id"] = modelId;
overflowInfo["stream_id"] = streamId;
overflowInfo["task_id"] = taskId;
overflowInfo["task_type"] = taskType;
overflowInfo["pc_start"] = DataUtils::U64ToHexString(pcStart);
overflowInfo["para_base"] = DataUtils::U64ToHexString(paraBase);
return overflowInfo;
}
static DebuggerErrno DumpOpDebugDataToDisk(const std::string& dumpPath, AclDumpMsg::DumpData& dumpData,
const uint8_t* data, size_t dataLen)
{
DEBUG_FUNC_TRACE();
std::string outPath = dumpPath + ".output.";
uint32_t num = static_cast<uint32_t>(dumpData.output().size());
for (uint32_t slot = 0; slot < num; slot++) {
uint32_t offset = 0;
nlohmann::json dhaAtomicAddInfo = ParseOverflowInfo(data + offset);
offset += DHA_ATOMIC_ADD_INFO_SIZE;
nlohmann::json l2AtomicAddInfo = ParseOverflowInfo(data + offset);
offset += L2_ATOMIC_ADD_INFO_SIZE;
nlohmann::json aiCoreInfo = ParseOverflowInfo(data + offset);
offset += AICORE_INFO_SIZE;
dhaAtomicAddInfo["status"] = DataUtils::UnpackUint64ValueLe(data + offset);
offset += DHA_ATOMIC_ADD_STATUS_SIZE;
l2AtomicAddInfo["status"] = DataUtils::UnpackUint64ValueLe(data + offset);
offset += L2_ATOMIC_ADD_STATUS_SIZE;
uint64_t kernelCode = DataUtils::UnpackUint64ValueLe(data + offset);
offset += UINT64_SIZE;
uint64_t blockIdx = DataUtils::UnpackUint64ValueLe(data + offset);
offset += UINT64_SIZE;
uint64_t status = DataUtils::UnpackUint64ValueLe(data + offset);
aiCoreInfo["kernel_code"] = DataUtils::U64ToHexString(kernelCode);
aiCoreInfo["block_idx"] = blockIdx;
aiCoreInfo["status"] = status;
nlohmann::json opdebugData;
opdebugData["DHA Atomic Add"] = dhaAtomicAddInfo;
opdebugData["L2 Atomic Add"] = l2AtomicAddInfo;
opdebugData["AI Core"] = aiCoreInfo;
std::string filePath = outPath + std::to_string(slot) + "." + JSON_SUFFIX;
DebuggerErrno ret = FileOperation::DumpJson(filePath, opdebugData);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, "Failed to dump data to " + filePath + ".");
return ret;
}
}
return DebuggerErrno::OK;
}
static DebuggerErrno ConvertFormatDeviceToHost(AclTensorInfo& tensor)
{
DEBUG_FUNC_TRACE();
if (tensor.deviceFmt == tensor.hostFmt || AclTensor::SizeOfTensor(tensor) == 0) {
LOG_DEBUG(tensor + ": No need to convert format.");
return DebuggerErrno::OK;
}
DebuggerErrno ret = AclTensor::TransFormatD2H(tensor);
if (ret == DebuggerErrno::ERROR_UNKNOWN_TRANS) {
LOG_INFO("Do not support convert format from " +
std::to_string(tensor.deviceFmt) + " to " + std::to_string(tensor.hostFmt) + ".");
tensor.hostFmt = tensor.deviceFmt;
return DebuggerErrno::OK;
}
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, tensor + ": Failed to convert format.");
return ret;
}
LOG_DEBUG(tensor + ": Convert format successfully.");
return DebuggerErrno::OK;
}
static std::string MappingFilePath(const std::string& originPath)
{
static std::string lastOriName;
static std::string lastMappingPath;
if (lastOriName == originPath && !lastMappingPath.empty()) {
return lastMappingPath;
}
std::string dir = FileUtils::GetParentDir(originPath);
std::string suffix = FileUtils::GetFileSuffix(originPath);
std::string mappingName;
uint32_t retry = 10;
constexpr uint32_t randFileNameLen = 32;
do {
mappingName = MathUtils::RandomString(randFileNameLen, '0', '9');
if (!suffix.empty()) {
mappingName.append(".").append(suffix);
}
if (!FileUtils::IsPathExist(dir + "/" + mappingName)) {
break;
}
} while (--retry);
if (retry == 0) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to map path " + originPath + ".");
return std::string();
}
DebuggerErrno ret = FileUtils::CreateDir(dir);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to create directory " + dir + ".");
return std::string();
}
std::ofstream ofs;
constexpr const char* mapFileName = "mapping.csv";
ret = FileUtils::OpenFile(dir + "/" + mapFileName, ofs, std::ofstream::app);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to open mapping file " + dir + "/" + mapFileName + ".");
return std::string();
}
ofs << mappingName << "," << FileUtils::GetFileName(originPath) << "\n";
if (ofs.fail()) {
LOG_ERROR(DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE, "Failed to write file " + dir + "/" + mapFileName + ".");
ofs.close();
return std::string();
}
ofs.close();
lastOriName = originPath;
lastMappingPath = dir + "/" + mappingName;
return lastMappingPath;
}
static DebuggerErrno StandardizedDumpPath(std::string& originPath)
{
std::string filename = FileUtils::GetFileName(originPath);
if (filename.length() <= FileUtils::FILE_NAME_MAX) {
return DebuggerErrno::OK;
}
std::string mappingPath = MappingFilePath(originPath);
if (mappingPath.empty()) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to open mapping file " + originPath + ".");
return DebuggerErrno::ERROR;
}
originPath = std::move(mappingPath);
return DebuggerErrno::OK;
}
static std::string GenDataPath(const std::string& path)
{
LOG_DEBUG("Original acl data path is " + path);
std::string outputPath = DebuggerConfig::GetInstance().GetOutputPath();
std::string dataPath;
if (path.compare(0, outputPath.length(), outputPath) != 0) {
return path;
}
dataPath = path.substr(outputPath.length());
const std::vector<std::string> items = FileUtils::SplitPath(dataPath);
constexpr const size_t expectSegLen = 9;
constexpr const size_t rankIdPos = 0;
constexpr const size_t timeStampPos = 1;
constexpr const size_t stepIdPos = 2;
constexpr const size_t dataNamePos = 8;
if (items.size() >= expectSegLen) {
dataPath = outputPath;
if (dataPath.at(dataPath.length() - 1) != '/') {
dataPath.append("/");
}
* ACL 接口返回数据的路径格式如下
* {dump_path}/rank_{rank_id}/{time stamp}/step_{step_id}/{time}
/{device_id}/{model_name}/{model_id}/{iteration_id}/{data name}
* items[0] 表示 rank_{rank_id}
* items[1] 表示 {time stamp}
* items[2] 表示 step_{step_id}
* items[8] 表示 {data name}
*/
dataPath.append(items[rankIdPos] + "/");
dataPath.append(items[timeStampPos] + "/");
dataPath.append(items[stepIdPos] + "/");
dataPath.append(items[dataNamePos]);
return dataPath;
}
return path;
}
inline std::string GetTensorInfoSuffix(AclTensorInfo& tensor)
{
return "." + tensor.inout + "." + std::to_string(tensor.slot) +
"." + DataUtils::GetFormatString(tensor.hostFmt) + "." + DataUtils::GetDTypeString(tensor.oriDtype);
}
static DebuggerErrno DumpOneAclTensorFmtBin(AclTensorInfo& tensor)
{
DebuggerErrno ret;
std::string dumpPathSlot = tensor.dumpPath + GetTensorInfoSuffix(tensor);
if (StandardizedDumpPath(dumpPathSlot) != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to standardize path " + dumpPathSlot + ".");
return DebuggerErrno::ERROR;
}
std::ofstream ofs;
ret = FileUtils::OpenFile(dumpPathSlot, ofs, std::ios::out | std::ios::binary);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, "Failed to open file " + dumpPathSlot + ".");
return ret;
}
ofs.write(reinterpret_cast<const char*>(tensor.aclData), tensor.dataSize);
if (ofs.fail()) {
LOG_ERROR(DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE, "Failed to write file " + dumpPathSlot + ".");
ret = DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE;
}
ofs.close();
return ret;
}
static DebuggerErrno DumpOneAclTensorFmtNpy(AclTensorInfo& tensor)
{
DEBUG_FUNC_TRACE();
DebuggerErrno ret;
if (tensor.dataSize == 0) {
LOG_INFO(tensor + ": Data size is 0. No need to dump.");
return DebuggerErrno::OK;
}
auto it = kDtypeTransMap.find(tensor.dtype);
if (it != kDtypeTransMap.end()) {
AclDtype dstDtype = it->second;
ret = AclTensor::TransDtype(tensor, dstDtype);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, tensor + ": Failed to transform dtype from " +
DataUtils::GetDTypeString(it->first) + " to " +
DataUtils::GetDTypeString(it->second)+ ".");
return ret;
}
}
std::string dumpPathSlot = tensor.dumpPath + GetTensorInfoSuffix(tensor) + "." + NPY_SUFFIX;
if (StandardizedDumpPath(dumpPathSlot) != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR, "Failed to standardize path " + dumpPathSlot + ".");
return DebuggerErrno::ERROR;
}
if (tensor.transBuf.empty()) {
ret = FileOperation::DumpNpy(dumpPathSlot, tensor.aclData, tensor.dataSize, tensor.dtype, tensor.hostShape);
} else {
ret = FileOperation::DumpNpy(dumpPathSlot, tensor.transBuf.data(), tensor.transBuf.size(), tensor.dtype,
tensor.hostShape);
}
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, tensor + ": Failed to dump as npy.");
return ret;
}
LOG_DEBUG(tensor + ": dump successfully.");
return ret;
}
static DebuggerErrno WriteOneTensorStatToDisk(const AclTensorStats& stat)
{
DEBUG_FUNC_TRACE();
if (stat.Empty()) {
return DebuggerErrno::OK;
}
std::string dumpfile = stat.GetPath() + "/statistic.csv";
uint32_t retry = 100;
uint32_t interval = 10;
if (FileUtils::CheckFileBeforeCreateOrWrite(dumpfile, true) != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR_FILE_ALREADY_EXISTS, "File " + dumpfile + " exists and has invalid format.");
return DebuggerErrno::ERROR_FILE_ALREADY_EXISTS;
}
int fd = open(dumpfile.c_str(), O_WRONLY | O_CREAT | O_APPEND, NORMAL_FILE_MODE_DEFAULT);
if (fd < 0) {
LOG_ERROR(DebuggerErrno::ERROR_FAILED_TO_OPEN_FILE, "Failed to open file " + dumpfile);
return DebuggerErrno::ERROR_FAILED_TO_OPEN_FILE;
}
uint32_t i;
for (i = 0; i < retry; ++i) {
if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
if (i == retry) {
LOG_ERROR(DebuggerErrno::ERROR_SYSCALL_FAILED, "Failed to occupy file " + dumpfile);
close(fd);
return DebuggerErrno::ERROR_SYSCALL_FAILED;
}
off_t offset = lseek(fd, 0, SEEK_END);
if (offset == 0) {
std::string header = stat.GetCsvHeader();
if (write(fd, header.c_str(), header.length()) < static_cast<ssize_t>(header.length())) {
LOG_ERROR(DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE, "Failed to write file " + dumpfile);
flock(fd, LOCK_UN);
close(fd);
return DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE;
}
}
std::string value = stat.GetCsvValue();
DebuggerErrno ret = DebuggerErrno::OK;
if (write(fd, value.c_str(), value.length()) < static_cast<ssize_t>(value.length())) {
LOG_ERROR(DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE, "Failed to write file " + dumpfile);
ret = DebuggerErrno::ERROR_FAILED_TO_WRITE_FILE;
}
flock(fd, LOCK_UN);
close(fd);
return ret;
}
static DebuggerErrno DumpOneAclTensor(AclTensorInfo& tensor, std::vector<DebuggerSummaryOption>& opt)
{
DEBUG_FUNC_TRACE();
if (tensor.dumpOriginData || !FileOperation::IsDtypeSupportByNpy(tensor.dtype)) {
if (kDtypeTransMap.find(tensor.dtype) == kDtypeTransMap.end()) {
return DumpOneAclTensorFmtBin(tensor);
}
}
DebuggerErrno ret = ConvertFormatDeviceToHost(tensor);
if (ret != DebuggerErrno::OK) {
LOG_ERROR(ret, tensor + ": Failed to convert format to host.");
return ret;
}
if (!opt.empty()) {
AclTensorStats stat = AclTensorStats::CalTensorSummary(tensor, opt);
return WriteOneTensorStatToDisk(stat);
}
return DumpOneAclTensorFmtNpy(tensor);
}
static void DumpAclTensor(std::vector<AclTensorInfo>::iterator begin, std::vector<AclTensorInfo>::iterator end,
std::vector<DebuggerSummaryOption> opt)
{
DEBUG_FUNC_TRACE();
DebuggerErrno ret = DebuggerErrno::OK;
for (auto it = begin; it != end; it++) {
ret = DumpOneAclTensor(*it, opt);
if (ret != DebuggerErrno::OK) {
LOG_WARNING(ret, *it + ": Failed to dump to disk.");
break;
}
}
return;
}
static DebuggerErrno DumpTensorDataToDisk(const std::string& dumpPath, AclDumpMsg::DumpData& dumpData,
const uint8_t* data, size_t dataLen, std::vector<DebuggerSummaryOption>& opt)
{
DEBUG_FUNC_TRACE();
std::vector<AclTensorInfo> aclTensorInfos;
uint64_t offset = 0;
uint32_t slot = 0;
for (auto& tensor : dumpData.input()) {
aclTensorInfos.push_back(AclTensor::ParseAttrsFromDumpData(dumpPath, data + offset, tensor, "input", slot));
offset += tensor.size();
slot++;
}
slot = 0;
for (auto& tensor : dumpData.output()) {
aclTensorInfos.push_back(AclTensor::ParseAttrsFromDumpData(dumpPath, data + offset, tensor, "output", slot));
offset += tensor.size();
slot++;
}
if (aclTensorInfos.empty()) {
return DebuggerErrno::OK;
}
if (offset > dataLen) {
LOG_ERROR(DebuggerErrno::ERROR_VALUE_OVERFLOW, dumpPath + ": offset overflow " + std::to_string(offset) + "/" +
std::to_string(dataLen) + ".");
return DebuggerErrno::ERROR_VALUE_OVERFLOW;
}
constexpr int kMaxTensorSize = 1024 * 1024;
if (offset < kMaxTensorSize) {
DumpAclTensor(aclTensorInfos.begin(), aclTensorInfos.end(), opt);
} else {
size_t concurrent = std::max<size_t>(1, std::thread::hardware_concurrency() / 4);
concurrent = std::min(concurrent, aclTensorInfos.size());
size_t total = aclTensorInfos.size();
size_t batch = MathUtils::DivCeil(total, concurrent);
size_t cur = 0;
std::vector<std::thread> threads;
std::vector<AclTensorInfo>::iterator begin = aclTensorInfos.begin();
threads.reserve(concurrent);
while (cur < total) {
threads.emplace_back(std::thread(&DumpAclTensor, begin + cur, begin + std::min(total, cur + batch), opt));
cur += batch;
}
for (auto& t : threads) {
if (t.joinable()) {
t.join();
}
}
}
DebuggerErrLevel err = ErrorInfosManager::GetTopErrLevelInDuration();
return err >= DebuggerErrLevel::LEVEL_ERROR ? DebuggerErrno::ERROR : DebuggerErrno::OK;
}
static DebuggerErrno DumpStatsDataToDisk(const std::string& dumpPath, const uint8_t* data, size_t dataLen)
{
DEBUG_FUNC_TRACE();
constexpr const size_t maxDataSize = 10 * 1024 * 1024;
if (dataLen > maxDataSize) {
LOG_ERROR(DebuggerErrno::ERROR_FILE_TOO_LARGE, "File " + dumpPath + " is too large to be dumped.");
return DebuggerErrno::ERROR_FILE_TOO_LARGE;
}
std::string content(reinterpret_cast<const char*>(data), dataLen);
std::vector<std::string> lines = FileUtils::SplitPath(content, '\n');
DebuggerErrno ret;
for (const auto& line : lines) {
if (line.empty() || line[0] == '\0') {
continue;
}
AclTensorStats stat = AclTensorStats::ParseTensorSummary(dumpPath, line);
ret = WriteOneTensorStatToDisk(stat);
if (ret != DebuggerErrno::OK) {
return ret;
}
}
return DebuggerErrno::OK;
}
DebuggerErrno AclDumpDataProcessor::DumpToDisk()
{
DEBUG_FUNC_TRACE();
if (!completed) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_OPERATION, ToString() + ": Data is incomplete.");
return DebuggerErrno::ERROR_INVALID_OPERATION;
}
uint8_t* msg = buffer.front()->data();
AclDumpMsg::DumpData dumpData;
if (headerSegLen > 0) {
if (!dumpData.ParseFromArray(msg + headerSegOffset, headerSegLen)) {
LOG_ERROR(DebuggerErrno::ERROR_INVALID_FORMAT, ToString() + ": Failed to parse header.");
return DebuggerErrno::ERROR_INVALID_FORMAT;
}
}
const std::string dataPath = GenDataPath(dumpPath);
DebuggerErrno ret;
if (FileUtils::GetFileName(dumpPath).find(DEBUG_FILE_SIGN) == 0 &&
DebuggerConfig::GetInstance().GetOverflowCheckCfg() != nullptr) {
ret = DumpOpDebugDataToDisk(dataPath, dumpData, msg + dataSegOffset, dataSegLen);
} else if (DebuggerConfig::GetInstance().GetStatisticsCfg() != nullptr &&
hostAnalysisOpts.empty()) {
ret = DumpStatsDataToDisk(dataPath, msg + dataSegOffset, dataSegLen);
} else {
ret = DumpTensorDataToDisk(dataPath, dumpData, msg + dataSegOffset, dataSegLen, hostAnalysisOpts);
}
if (ret != DebuggerErrno::OK) {
LOG_ERROR(DebuggerErrno::ERROR_OPERATION_FAILED, ToString() + ": Failed to dump to disk.");
}
return ret;
}
}