#include <pwd.h>
#include <sys/stat.h>
#include "op_api_common_base.h"
#include "op_api_common.h"
// Per-thread byte buffer whose first g_hash_offset bytes are hashed by calc_hash_id().
// NOTE(review): presumably filled by the MEMCPY_TO_BUF macro (defined outside this file) - confirm.
thread_local char g_hash_buf[g_hash_buf_size];
// Per-thread length of the valid data in g_hash_buf. Several add_param_to_buf overloads set it to
// g_hash_buf_max_size, in which case calc_hash_id() returns 0.
thread_local int g_hash_offset = 0;
// Right-shift distance used by mixture().
constexpr int g_rShift33Bits = 33;
// Multipliers used by mixture().
constexpr uint64_t MIX_STEP1 = 18397679294719823053LLU;
constexpr uint64_t MIX_STEP2 = 14181476777654086739LLU;
// UID that checkOwner() treats as the root user.
constexpr int OWNER_ROOT_UID = 0;
// Signature of the "AddTensorAddrToCachedList" entry point resolved through GetOpApiFuncAddr().
typedef void(*AddTensorAddrToCachedList) (void *addr);
// Format substitution applied by ConvertType() for 4-bit dtypes stored in a non-base format;
// a format missing from this table makes ConvertType() raise via TORCH_CHECK.
static std::unordered_map<aclFormat, aclFormat> FORMAT_FAKE_TO_REAL {
{ ACL_FORMAT_FRACTAL_NZ_C0_16, ACL_FORMAT_FRACTAL_NZ_C0_32 },
{ ACL_FORMAT_FRACTAL_NZ, ACL_FORMAT_FRACTAL_NZ }
};
/**
 * Decides from file ownership whether a custom op-api library may be used.
 *
 * @param cusLibPath Path of the library file to inspect.
 * @return true when the file may be used, false when it must be skipped:
 *         - file cannot be stat()-ed                 -> false (warns once)
 *         - non-root process, root-owned file        -> true  (warns once)
 *         - root process, file owned by another user -> false (warns once)
 *         - non-root process, file owned by another non-root user -> false (warns once)
 *         - otherwise (owner equals current user)    -> true
 */
bool checkOwner(string cusLibPath)
{
    struct stat fileInfo;
    // A failed stat() leaves fileInfo uninitialized; comparing its st_uid would be meaningless,
    // so an unreadable file is treated as not trustworthy.
    if (stat(cusLibPath.c_str(), &fileInfo) != 0) {
        TORCH_NPU_WARN_ONCE("The ", cusLibPath, " can not be checked by stat, will skip this file.");
        return false;
    }
    const auto cusLibOwnerUid = fileInfo.st_uid;
    const auto curOwnerUid = getuid();
    const bool curIsRoot = (curOwnerUid == OWNER_ROOT_UID);
    const bool libIsRoot = (cusLibOwnerUid == OWNER_ROOT_UID);

    if (!curIsRoot && libIsRoot) {
        TORCH_NPU_WARN_ONCE("A common user is using the files of the root user.");
        return true;
    }
    if ((curIsRoot && !libIsRoot) || (!curIsRoot && (curOwnerUid != cusLibOwnerUid))) {
        TORCH_NPU_WARN_ONCE("The ", cusLibPath,
            " owner does not match current owner or the root user is using the files of a common user, will skip this file.");
        return false;
    }
    return true;
}
/**
 * Splits a string on every occurrence of a delimiter.
 *
 * @param s   Text to split.
 * @param del Delimiter; may be longer than one character. An empty delimiter yields the
 *            whole input as a single element.
 * @return The pieces in order. Empty pieces are kept (e.g. "a," -> {"a", ""}); the result
 *         always contains at least one element.
 */
static std::vector<std::string> split_str(std::string s, const std::string &del)
{
    std::vector<std::string> path_list;
    if (del.empty()) {
        // find("") matches at every position and would never terminate usefully.
        path_list.push_back(s);
        return path_list;
    }
    size_t start = 0;
    size_t end = s.find(del);
    while (end != std::string::npos) {
        path_list.push_back(s.substr(start, end - start));
        // Skip the whole delimiter, not just one character.
        start = end + del.size();
        end = s.find(del, start);
    }
    path_list.push_back(s.substr(start));
    return path_list;
}
/**
 * Reports whether a path exists on disk.
 *
 * @param path Path to probe.
 * @return false for an empty path or one longer than PATH_MAX; otherwise whether
 *         access(path, F_OK) succeeds.
 */
static bool is_file_exist(const std::string &path)
{
    const bool lengthAcceptable = !path.empty() && path.size() <= PATH_MAX;
    if (!lengthAcceptable) {
        return false;
    }
    return access(path.c_str(), F_OK) == 0;
}
/**
 * Canonicalizes a path with realpath().
 *
 * @param path Path to resolve.
 * @return The resolved absolute path, or an empty string when the input is empty,
 *         longer than PATH_MAX, or cannot be resolved.
 */
std::string real_path(const std::string &path)
{
    std::string resolved;
    if (!path.empty() && path.size() <= PATH_MAX) {
        char buffer[PATH_MAX] = {0};
        if (realpath(path.c_str(), buffer) != nullptr) {
            resolved = std::string(buffer);
        }
    }
    return resolved;
}
/**
 * Builds the list of custom op-api library directories from ASCEND_CUSTOM_OPP_PATH.
 *
 * The variable is a ':'-separated list; "/op_api/lib/" is appended to every entry.
 * Empty entries (empty variable, "a::b", trailing ':') are skipped, because appending
 * the suffix to them would produce the absolute directory "/op_api/lib/".
 *
 * @return The directories in the order given by the variable; empty when the variable
 *         is unset or contains no non-empty entry.
 */
std::vector<std::string> get_custom_lib_path()
{
    const char *ascend_custom_opppath = std::getenv("ASCEND_CUSTOM_OPP_PATH");
    if (ascend_custom_opppath == nullptr) {
        ASCEND_LOGW("ASCEND_CUSTOM_OPP_PATH does not exist");
        return std::vector<std::string>();
    }
    std::vector<std::string> custom_lib_path_list;
    for (const auto &entry : split_str(ascend_custom_opppath, ":")) {
        if (entry.empty()) {
            continue;
        }
        custom_lib_path_list.push_back(entry + "/op_api/lib/");
    }
    return custom_lib_path_list;
}
/**
 * Builds the list of vendor op-api library directories under $ASCEND_OPP_PATH/vendors.
 *
 * Reads vendors/config.ini, takes the first line starting with "load_priority=", splits
 * the remainder on ',' and resolves "<vendors>/<name>/op_api/lib/" for each name.
 * Empty names and directories that cannot be resolved are dropped: an empty entry would
 * later be combined with "/" + library name and point at the filesystem root.
 *
 * @return Resolved directories in priority order; empty when the variable is unset, the
 *         config file is missing or unreadable, or no "load_priority=" line is present.
 */
std::vector<std::string> get_default_custom_lib_path()
{
    const char *ascend_opp_path = std::getenv("ASCEND_OPP_PATH");
    if (ascend_opp_path == nullptr) {
        ASCEND_LOGW("ASCEND_OPP_PATH does not exist");
        return std::vector<std::string>();
    }

    const std::string vendors_path = std::string(ascend_opp_path) + "/vendors";
    const std::string vendors_config_file = real_path(vendors_path + "/config.ini");
    if (vendors_config_file.empty()) {
        ASCEND_LOGW("config.ini does not exist");
        return std::vector<std::string>();
    }
    if (!is_file_exist(vendors_config_file)) {
        ASCEND_LOGW("config.ini does not exist or the path length is more than %d", PATH_MAX);
        return std::vector<std::string>();
    }

    std::ifstream ifs(vendors_config_file);
    if (!ifs.is_open()) {
        ASCEND_LOGW("config.ini can not be opened");
        return std::vector<std::string>();
    }

    const std::string head = "load_priority=";
    std::string line;
    bool found = false;
    while (std::getline(ifs, line)) {
        if (line.compare(0, head.size(), head) == 0) {
            found = true;
            break;
        }
    }
    if (!found) {
        ASCEND_LOGW("load_priority is not configured in config.ini");
        return std::vector<std::string>();
    }
    line.erase(0, head.length());

    std::vector<std::string> default_vendors_list;
    for (const auto &vendor : split_str(line, ",")) {
        if (vendor.empty()) {
            continue;
        }
        std::string lib_dir = real_path(vendors_path + "/" + vendor + "/op_api/lib/");
        if (lib_dir.empty()) {
            // Directory missing or unresolvable: nothing can be loaded from it.
            continue;
        }
        default_vendors_list.push_back(lib_dir);
    }
    return default_vendors_list;
}
// Library search directories computed once during static initialization of this translation unit;
// both lists are consulted, in this order, by GetOpApiFuncAddr().
const std::vector<std::string> g_custom_lib_path = get_custom_lib_path();
const std::vector<std::string> g_default_custom_lib_path = get_default_custom_lib_path();
/**
 * @return true when `str` begins with `prefix` (an empty prefix always matches).
 */
bool hasPrefix(const std::string &str, const std::string &prefix)
{
    // A reverse search anchored at index 0 can only match at the very start.
    return str.rfind(prefix, 0) == 0;
}
/**
 * @return true when `str` ends with `suffix` (an empty suffix always matches).
 */
bool hasSuffix(const std::string &str, const std::string &suffix)
{
    const size_t strLen = str.size();
    const size_t sufLen = suffix.size();
    return sufLen <= strLen && str.compare(strLen - sufLen, sufLen, suffix) == 0;
}
/**
 * Lists the file names of all regular files named "libopapi_*.so" in $ASCEND_HOME_PATH/lib64.
 *
 * @return File names (without directory) in directory-iteration order; empty when the
 *         variable is unset, the directory cannot be resolved, or nothing matches.
 *         Filesystem errors during iteration are logged and end the scan.
 */
std::vector<std::string> GetAllOpApiSoFiles()
{
    std::vector<std::string> found;

    const char *homeEnv = std::getenv("ASCEND_HOME_PATH");
    if (homeEnv == nullptr) {
        ASCEND_LOGW("ASCEND_HOME_PATH does not exist");
        return found;
    }

    const std::string libDir = real_path(std::string(homeEnv) + "/lib64");
    if (libDir.empty()) {
        ASCEND_LOGW("ASCEND_HOME_PATH/lib64 does not exist");
        return found;
    }
    if (!is_file_exist(libDir)) {
        ASCEND_LOGW("ASCEND_HOME_PATH/lib64 does not exist or the path length is more than %d", PATH_MAX);
        return found;
    }

    try {
        for (const auto& dirEntry : std::filesystem::directory_iterator(libDir)) {
            if (dirEntry.is_regular_file()) {
                std::string name = dirEntry.path().filename().string();
                const bool isOpApiLib = hasPrefix(name, "libopapi_") && hasSuffix(name, ".so");
                if (isOpApiLib) {
                    ASCEND_LOGI("%s is found.", name.c_str());
                    found.push_back(name);
                }
            }
        }
    } catch (const std::filesystem::filesystem_error& e) {
        ASCEND_LOGW("Filesystem error: %s", e.what());
    } catch (const std::exception& e) {
        ASCEND_LOGW("Error: %s", e.what());
    }

    if (found.empty()) {
        ASCEND_LOGW("ASCEND_HOME_PATH does not get any libopapi_*.so file");
    }
    return found;
}
// File names of the libopapi_*.so libraries found at static-initialization time; g_opApiHandlers
// is built from this list and GetOpApiFuncAddr() indexes both with the same index.
const std::vector<std::string> g_opApiSoFiles = GetAllOpApiSoFiles();
std::vector<void *> GetAllOpApiHandlers()
{
std::vector<void *> opApiHandlers;
for (const auto& opApiSoFile : g_opApiSoFiles) {
auto opApiHandler = GetOpApiLibHandler(opApiSoFile.c_str());
if (opApiHandler != nullptr) {
ASCEND_LOGI("%s has got handler.", opApiSoFile.c_str());
}
opApiHandlers.push_back(opApiHandler);
}
if (opApiHandlers.empty()) {
ASCEND_LOGW("ASCEND_HOME_PATH does not get any libopapi_*.so handler");
}
return opApiHandlers;
}
// Library handles for g_opApiSoFiles (same order, entries may be null), created at static-initialization time.
const std::vector<void *> g_opApiHandlers = GetAllOpApiHandlers();
// Appends the description of a tensor to the hash buffer and reports its storage address to the
// runtime-resolved "AddTensorAddrToCachedList" entry point.
// Written fields, in order: sizes, scalar type, ",", strides, storage offset, storage element count.
// An undefined tensor contributes only ",". A scalar wrapped into a tensor sets g_hash_offset to
// g_hash_buf_max_size, which makes calc_hash_id() return 0.
// Raises via TORCH_CHECK when the entry point cannot be resolved or the item size is not positive.
// NOTE(review): MEMCPY_TO_BUF is defined outside this file; it may contain an early return when the
// buffer is full - confirm before reordering any statement here.
void add_param_to_buf(const at::Tensor &at_tensor)
{
// Function-local static: the symbol lookup happens only on the first call.
static const auto addTensorAddrToCachedListAddr = GetOpApiFuncAddr("AddTensorAddrToCachedList");
TORCH_CHECK(addTensorAddrToCachedListAddr != nullptr, "GetOpApiFuncAddr failed.", OPS_ERROR(ErrCode::PTR));
AddTensorAddrToCachedList addTensorAddrToCachedListFunc =
reinterpret_cast<AddTensorAddrToCachedList>(addTensorAddrToCachedListAddr);
if (!at_tensor.defined()) {
MEMCPY_TO_BUF(",", 1);
return;
}
CheckNpuTensorValid(at_tensor);
if (at_npu::native::OpPreparation::is_scalar_wrapped_to_tensor(at_tensor)) {
g_hash_offset = g_hash_buf_max_size;
return;
}
MEMCPY_TO_BUF(at_tensor.sizes().data(), static_cast<int64_t>(at_tensor.sizes().size() * sizeof(int64_t)));
auto st = at_tensor.scalar_type();
MEMCPY_TO_BUF(&st, sizeof(st));
MEMCPY_TO_BUF(",", 1);
// The stride byte count is computed from sizes().size(); assumes strides() has the same length.
MEMCPY_TO_BUF(at_tensor.strides().data(), static_cast<int64_t>(at_tensor.sizes().size() * sizeof(int64_t)));
auto so = at_tensor.storage_offset();
MEMCPY_TO_BUF(&so, sizeof(so));
aclDataType acl_data_type = at_npu::native::OpPreparation::convert_to_acl_data_type(st);
// Holds zero entries for ACL_STRING, otherwise one entry: storage size in elements.
c10::SmallVector<int64_t, 5> storageDims;
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::PARAM));
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize());
}
MEMCPY_TO_BUF(storageDims.data(), static_cast<int64_t>(storageDims.size() * sizeof(int64_t)));
addTensorAddrToCachedListFunc(const_cast<void*>(at_tensor.storage().data()));
}
// Appends a scalar's value followed by its ScalarType to the hash buffer.
// Only Double, Long, Bool and ComplexDouble scalars are written; any other type writes nothing.
void add_param_to_buf(const at::Scalar &at_scalar)
{
    at::ScalarType kind = at_scalar.type();
    if (kind == at::ScalarType::Double) {
        double payload = at_scalar.toDouble();
        MEMCPY_TO_BUF(&payload, sizeof(double));
    } else if (kind == at::ScalarType::Long) {
        int64_t payload = at_scalar.toLong();
        MEMCPY_TO_BUF(&payload, sizeof(int64_t));
    } else if (kind == at::ScalarType::Bool) {
        bool payload = at_scalar.toBool();
        MEMCPY_TO_BUF(&payload, sizeof(bool));
    } else if (kind == at::ScalarType::ComplexDouble) {
        auto payload = at_scalar.toComplexDouble();
        MEMCPY_TO_BUF(&payload, sizeof(payload));
    } else {
        return;
    }
    MEMCPY_TO_BUF(&kind, sizeof(at::ScalarType));
}
// Appends the elements of an int array, then its element count, to the hash buffer.
void add_param_to_buf(const at::IntArrayRef &at_array)
{
    auto elemCount = at_array.size();
    MEMCPY_TO_BUF(at_array.data(), static_cast<int64_t>(elemCount * sizeof(int64_t)));
    MEMCPY_TO_BUF(&elemCount, sizeof(elemCount));
}
// Appends a SymInt array to the hash buffer after reinterpreting it as plain int64 values
// (unchecked conversion), followed by its element count.
void add_param_to_buf(const at::ArrayRef<c10::SymInt> &int_array)
{
    auto plainInts = c10::asIntArrayRefUnchecked(int_array);
    auto elemCount = plainInts.size();
    MEMCPY_TO_BUF(plainInts.data(), static_cast<int64_t>(elemCount * sizeof(int64_t)));
    MEMCPY_TO_BUF(&elemCount, sizeof(elemCount));
}
// Appends the elements of a bool array, then its element count, to the hash buffer.
void add_param_to_buf(const at::ArrayRef<bool> &at_array)
{
    auto elemCount = at_array.size();
    MEMCPY_TO_BUF(at_array.data(), static_cast<int64_t>(elemCount * sizeof(bool)));
    MEMCPY_TO_BUF(&elemCount, sizeof(elemCount));
}
// Appends every tensor of the list (via the at::Tensor overload), then the list length.
void add_param_to_buf(const at::TensorList &at_tensor_list)
{
    for (const auto &tensor : at_tensor_list) {
        add_param_to_buf(tensor);
    }
    auto listLen = at_tensor_list.size();
    MEMCPY_TO_BUF(&listLen, sizeof(listLen));
}
// Appends every scalar of the list (via the at::Scalar overload), then the list length and ",".
void add_param_to_buf(const at::ArrayRef<at::Scalar> &at_scalar_list)
{
    for (const auto &scalar : at_scalar_list) {
        add_param_to_buf(scalar);
    }
    auto listLen = at_scalar_list.size();
    MEMCPY_TO_BUF(&listLen, sizeof(listLen));
    MEMCPY_TO_BUF(",", 1);
}
// Appends an optional tensor; an absent or undefined tensor contributes only ",".
void add_param_to_buf(const c10::optional<at::Tensor> &opt_tensor)
{
    const bool usable = opt_tensor.has_value() && opt_tensor.value().defined();
    if (!usable) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf(opt_tensor.value());
}
// Appends an optional int array; an absent value contributes only ",".
void add_param_to_buf(const c10::optional<at::IntArrayRef> &opt_array)
{
    if (!opt_array.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf(opt_array.value());
}
// Appends an optional SymInt array; an absent value contributes only ",".
void add_param_to_buf(const c10::OptionalArrayRef<c10::SymInt> &opt_array)
{
    if (!opt_array.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf(opt_array.value());
}
// Appends an optional int array reference; an absent value contributes only ",".
void add_param_to_buf(const c10::OptionalIntArrayRef &opt_array)
{
    if (!opt_array.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf(opt_array.value());
}
// Appends an optional scalar; an absent value contributes only ",".
void add_param_to_buf(const c10::optional<at::Scalar> &opt_scalar)
{
    if (!opt_scalar.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf(opt_scalar.value());
}
// Appends the raw bytes of a ScalarType value to the hash buffer.
void add_param_to_buf(const at::ScalarType scalar_type)
{
    MEMCPY_TO_BUF(&scalar_type, sizeof(at::ScalarType));
}
void add_param_to_buf(const string& s)
{
MEMCPY_TO_BUF(s.c_str(), static_cast<int64_t>(s.size()));
}
// Appends a NUL-terminated string (without the terminator), then its length, to the hash buffer.
void add_param_to_buf(char *c)
{
    auto textLen = strlen(c);
    MEMCPY_TO_BUF(c, textLen);
    MEMCPY_TO_BUF(&textLen, sizeof(textLen));
}
// Appends a NUL-terminated string (without the terminator), then its length, to the hash buffer.
void add_param_to_buf(const char *c)
{
    auto textLen = strlen(c);
    MEMCPY_TO_BUF(c, textLen);
    MEMCPY_TO_BUF(&textLen, sizeof(textLen));
}
// Appends the description of a wrapped tensor to the hash buffer and reports its storage address
// to the runtime-resolved "AddTensorAddrToCachedList" entry point.
// Same layout as the at::Tensor overload, except that the ACL data type is taken from the
// wrapper's dtype field instead of being derived from the tensor's scalar type.
// Raises via TORCH_CHECK when the entry point cannot be resolved or the item size is not positive.
// NOTE(review): MEMCPY_TO_BUF is defined outside this file; it may contain an early return when the
// buffer is full - confirm before reordering any statement here.
void add_param_to_buf(const TensorWrapper &tensor_r)
{
// Function-local static: the symbol lookup happens only on the first call.
static const auto addTensorAddrToCachedListAddr = GetOpApiFuncAddr("AddTensorAddrToCachedList");
TORCH_CHECK(addTensorAddrToCachedListAddr != nullptr, "GetOpApiFuncAddr failed.", OPS_ERROR(ErrCode::PTR));
AddTensorAddrToCachedList addTensorAddrToCachedListFunc =
reinterpret_cast<AddTensorAddrToCachedList>(addTensorAddrToCachedListAddr);
const at::Tensor &at_tensor = tensor_r.tensor_;
// An undefined tensor contributes only ",".
if (!at_tensor.defined()) {
MEMCPY_TO_BUF(",", 1);
return;
}
CheckNpuTensorValid(at_tensor);
// A scalar wrapped into a tensor marks the buffer so that calc_hash_id() returns 0.
if (at_npu::native::OpPreparation::is_scalar_wrapped_to_tensor(at_tensor)) {
g_hash_offset = g_hash_buf_max_size;
return;
}
aclDataType acl_data_type = tensor_r.dtype;
MEMCPY_TO_BUF(at_tensor.sizes().data(), static_cast<int64_t>(at_tensor.sizes().size() * sizeof(int64_t)));
MEMCPY_TO_BUF(&acl_data_type, sizeof(acl_data_type));
MEMCPY_TO_BUF(",", 1);
// The stride byte count is computed from sizes().size(); assumes strides() has the same length.
MEMCPY_TO_BUF(at_tensor.strides().data(), static_cast<int64_t>(at_tensor.sizes().size() * sizeof(int64_t)));
auto so = at_tensor.storage_offset();
MEMCPY_TO_BUF(&so, sizeof(so));
// Holds zero entries for ACL_STRING, otherwise one entry: storage size in elements.
c10::SmallVector<int64_t, 5> storageDims;
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::PARAM));
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize());
}
MEMCPY_TO_BUF(storageDims.data(), static_cast<int64_t>(storageDims.size() * sizeof(int64_t)));
addTensorAddrToCachedListFunc(const_cast<void*>(at_tensor.storage().data()));
}
void add_param_to_buf(const TensorListWrapper &tensor_list_wrapper)
{
const at::TensorList &at_tensor_list = tensor_list_wrapper.tensor_list_;
for (size_t i = 0; i < at_tensor_list.size(); i++) {
add_param_to_buf(TensorWrapper{
tensor_list_wrapper.tensor_list_[i], tensor_list_wrapper.dtype});
}
auto counter = at_tensor_list.size();
MEMCPY_TO_BUF(&counter, sizeof(counter));
}
// Zero-argument overload: appends nothing.
// NOTE(review): presumably the terminating case of a variadic add_param_to_buf template declared elsewhere - confirm.
void add_param_to_buf() {}
// Appends the description held in a tensor struct to the hash buffer and reports its data pointer
// to the runtime-resolved "AddTensorAddrToCachedList" entry point.
// Written fields, in order: sizes, ACL type, ",", strides, storage offset, storage element count.
// A null pointer contributes only ",".
// Raises via TORCH_CHECK when the entry point cannot be resolved or itemsize is not positive.
// NOTE(review): MEMCPY_TO_BUF is defined outside this file; it may contain an early return when the
// buffer is full - confirm before reordering any statement here.
void add_param_to_buf_v2(TensorStructPtr at_tensor)
{
// Function-local static: the symbol lookup happens only on the first call.
static const auto addTensorAddrToCachedListAddr = GetOpApiFuncAddr("AddTensorAddrToCachedList");
TORCH_CHECK(addTensorAddrToCachedListAddr != nullptr, "GetOpApiFuncAddr failed.", OPS_ERROR(ErrCode::PTR));
AddTensorAddrToCachedList addTensorAddrToCachedListFunc =
reinterpret_cast<AddTensorAddrToCachedList>(addTensorAddrToCachedListAddr);
if (at_tensor == nullptr) {
MEMCPY_TO_BUF(",", 1);
return;
}
MEMCPY_TO_BUF((*at_tensor).sizes.data(), static_cast<int64_t>((*at_tensor).sizes.size() * sizeof(int64_t)));
auto acl_data_type = (*at_tensor).acl_type;
MEMCPY_TO_BUF(&acl_data_type, sizeof(acl_data_type));
MEMCPY_TO_BUF(",", 1);
// The stride byte count is computed from sizes.size(); assumes strides has the same length.
MEMCPY_TO_BUF((*at_tensor).strides.data(), static_cast<int64_t>((*at_tensor).sizes.size() * sizeof(int64_t)));
auto so = (*at_tensor).storage_offset;
MEMCPY_TO_BUF(&so, sizeof(so));
// Holds zero entries for ACL_STRING, otherwise one entry: nbytes / itemsize.
c10::SmallVector<int64_t, 5> storageDims;
if (acl_data_type != ACL_STRING) {
TORCH_CHECK((*at_tensor).itemsize > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::PARAM));
storageDims.push_back((*at_tensor).nbytes / (*at_tensor).itemsize);
}
MEMCPY_TO_BUF(storageDims.data(), static_cast<int64_t>(storageDims.size() * sizeof(int64_t)));
addTensorAddrToCachedListFunc((*at_tensor).data_ptr);
}
// Appends a scalar's value followed by its ScalarType to the hash buffer.
// Only Double, Long, Bool and ComplexDouble scalars are written; any other type writes nothing.
void add_param_to_buf_v2(const at::Scalar &at_scalar)
{
    at::ScalarType kind = at_scalar.type();
    if (kind == at::ScalarType::Double) {
        double payload = at_scalar.toDouble();
        MEMCPY_TO_BUF(&payload, sizeof(double));
    } else if (kind == at::ScalarType::Long) {
        int64_t payload = at_scalar.toLong();
        MEMCPY_TO_BUF(&payload, sizeof(int64_t));
    } else if (kind == at::ScalarType::Bool) {
        bool payload = at_scalar.toBool();
        MEMCPY_TO_BUF(&payload, sizeof(bool));
    } else if (kind == at::ScalarType::ComplexDouble) {
        auto payload = at_scalar.toComplexDouble();
        MEMCPY_TO_BUF(&payload, sizeof(payload));
    } else {
        return;
    }
    MEMCPY_TO_BUF(&kind, sizeof(at::ScalarType));
}
// Appends the elements of an int64 vector, then its element count, to the hash buffer.
void add_param_to_buf_v2(const std::vector<int64_t> &at_array)
{
    auto elemCount = at_array.size();
    MEMCPY_TO_BUF(at_array.data(), static_cast<int64_t>(elemCount * sizeof(int64_t)));
    MEMCPY_TO_BUF(&elemCount, sizeof(elemCount));
}
// Appends the elements of a bool vector (one byte per element, 0 or 1), then its element count,
// to the hash buffer.
// std::vector<bool> is bit-packed and has no data(), so the values are first expanded into a
// contiguous byte vector. The byte count passed to MEMCPY_TO_BUF matches that vector exactly,
// and the vector releases its memory on every exit path, including an early return from the macro.
void add_param_to_buf_v2(const std::vector<bool> &at_array)
{
    std::vector<uint8_t> flags(at_array.size());
    for (size_t i = 0; i < at_array.size(); i++) {
        flags[i] = at_array[i] ? 1 : 0;
    }
    MEMCPY_TO_BUF(flags.data(), static_cast<int64_t>(flags.size() * sizeof(uint8_t)));
    auto counter = at_array.size();
    MEMCPY_TO_BUF(&counter, sizeof(counter));
}
void add_param_to_buf_v2(const std::vector<TensorStructPtr> &at_tensor_list)
{
for (size_t i = 0; i < at_tensor_list.size(); i++) {
add_param_to_buf_v2(at_tensor_list[i]);
}
auto counter = at_tensor_list.size();
MEMCPY_TO_BUF(&counter, sizeof(counter));
}
// Appends every scalar of the list (via the at::Scalar overload), then the list length and ",".
void add_param_to_buf_v2(const std::vector<at::Scalar> &at_scalar_list)
{
    for (const auto &scalar : at_scalar_list) {
        add_param_to_buf_v2(scalar);
    }
    auto listLen = at_scalar_list.size();
    MEMCPY_TO_BUF(&listLen, sizeof(listLen));
    MEMCPY_TO_BUF(",", 1);
}
// Appends an optional int64 vector; an absent value contributes only ",".
void add_param_to_buf_v2(const c10::optional<std::vector<int64_t>> &opt_array)
{
    if (!opt_array.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf_v2(opt_array.value());
}
// Appends an optional scalar; an absent value contributes only ",".
void add_param_to_buf_v2(const c10::optional<at::Scalar> &opt_scalar)
{
    if (!opt_scalar.has_value()) {
        MEMCPY_TO_BUF(",", 1);
        return;
    }
    add_param_to_buf_v2(opt_scalar.value());
}
// Appends the raw bytes of a ScalarType value to the hash buffer.
void add_param_to_buf_v2(const at::ScalarType scalar_type)
{
    MEMCPY_TO_BUF(&scalar_type, sizeof(at::ScalarType));
}
void add_param_to_buf_v2(const string& s)
{
MEMCPY_TO_BUF(s.c_str(), static_cast<int64_t>(s.size()));
}
// Appends a NUL-terminated string (without the terminator), then its length, to the hash buffer.
void add_param_to_buf_v2(char *c)
{
    auto textLen = strlen(c);
    MEMCPY_TO_BUF(c, textLen);
    MEMCPY_TO_BUF(&textLen, sizeof(textLen));
}
// Appends a NUL-terminated string (without the terminator), then its length, to the hash buffer.
void add_param_to_buf_v2(const char *c)
{
    auto textLen = strlen(c);
    MEMCPY_TO_BUF(c, textLen);
    MEMCPY_TO_BUF(&textLen, sizeof(textLen));
}
// Zero-argument overload: appends nothing.
// NOTE(review): presumably the terminating case of a variadic add_param_to_buf_v2 template declared elsewhere - confirm.
void add_param_to_buf_v2()
{
}
/**
 * Rotates a 64-bit value left.
 *
 * @param x Value to rotate.
 * @param n Rotation distance; taken modulo 64, so 0 and 64 both return x unchanged.
 * @return x rotated left by n bits.
 */
inline uint64_t rotating_left(uint64_t x, uint8_t n)
{
    const unsigned shift = n & 63U;
    if (shift == 0) {
        // Avoids the undefined 64-bit right shift that a zero distance would require.
        return x;
    }
    return (x << shift) | (x >> (64U - shift));
}
/**
 * Finalization mix for a 64-bit hash word: three xor-shift steps (by g_rShift33Bits)
 * interleaved with multiplications by MIX_STEP1 and MIX_STEP2.
 */
inline uint64_t mixture(uint64_t x)
{
    x = (x ^ (x >> g_rShift33Bits)) * MIX_STEP1;
    x = (x ^ (x >> g_rShift33Bits)) * MIX_STEP2;
    return x ^ (x >> g_rShift33Bits);
}
/**
 * Hashes a byte range to a 64-bit value.
 *
 * The input is consumed in 16-byte blocks (two 64-bit words each) followed by a tail of
 * up to 15 bytes, then finalized with mixture().
 * NOTE(review): the structure resembles MurmurHash3 x64-128 with only one half returned - confirm.
 *
 * Block words are loaded with memcpy instead of dereferencing a cast uint64_t pointer,
 * so the result is the same but the buffer does not need 8-byte alignment.
 *
 * @param key  Start of the data; must be readable for `len` bytes.
 * @param len  Number of bytes to hash (expected to be non-negative).
 * @param seed Initial value of both internal state words.
 * @return The hash value.
 */
uint64_t gen_hash(const void *key, const int len, const uint32_t seed = 0xdeadb0d7)
{
    const uint8_t *data = static_cast<const uint8_t *>(key);
    const int block_num = len / 16;
    uint64_t has = seed;
    uint64_t hax = seed;
    const uint64_t c1 = 9782798678568883157LLU;
    const uint64_t c2 = 5545529020109919103LLU;

    for (int i = 0; i < block_num; i++) {
        const uint8_t *block = data + static_cast<size_t>(i) * 16;
        uint64_t tmp1 = 0;
        uint64_t tmp2 = 0;
        memcpy(&tmp1, block, sizeof(tmp1));
        memcpy(&tmp2, block + sizeof(tmp1), sizeof(tmp2));

        int8_t bits_31 = 31;
        tmp1 *= c1;
        tmp1 = rotating_left(tmp1, bits_31);
        tmp1 *= c2;
        has ^= tmp1;
        int8_t bits_27 = 27;
        has = rotating_left(has, bits_27);
        has += hax;
        has = has * 5 + 1390208809;

        int8_t bits_33 = 33;
        tmp2 *= c2;
        tmp2 = rotating_left(tmp2, bits_33);
        tmp2 *= c1;
        hax ^= tmp2;
        hax = rotating_left(hax, bits_31);
        hax += has;
        hax = hax * 5 + 944331445;
    }

    // Remaining 0..15 bytes: bytes 8..14 feed t2, bytes 0..7 feed t1.
    const uint8_t *tail = data + static_cast<size_t>(block_num) * 16;
    uint64_t t1 = 0;
    uint64_t t2 = 0;
    switch (static_cast<uint64_t>(len) & 15) {
        case 15:
            t2 ^= static_cast<uint64_t>(tail[14]) << 48;
            [[fallthrough]];
        case 14:
            t2 ^= static_cast<uint64_t>(tail[13]) << 40;
            [[fallthrough]];
        case 13:
            t2 ^= static_cast<uint64_t>(tail[12]) << 32;
            [[fallthrough]];
        case 12:
            t2 ^= static_cast<uint64_t>(tail[11]) << 24;
            [[fallthrough]];
        case 11:
            t2 ^= static_cast<uint64_t>(tail[10]) << 16;
            [[fallthrough]];
        case 10:
            t2 ^= static_cast<uint64_t>(tail[9]) << 8;
            [[fallthrough]];
        case 9:
            t2 ^= static_cast<uint64_t>(tail[8]) << 0;
            t2 *= c2;
            t2 = rotating_left(t2, 33);
            t2 *= c1;
            hax ^= t2;
            [[fallthrough]];
        case 8:
            t1 ^= static_cast<uint64_t>(tail[7]) << 56;
            [[fallthrough]];
        case 7:
            t1 ^= static_cast<uint64_t>(tail[6]) << 48;
            [[fallthrough]];
        case 6:
            t1 ^= static_cast<uint64_t>(tail[5]) << 40;
            [[fallthrough]];
        case 5:
            t1 ^= static_cast<uint64_t>(tail[4]) << 32;
            [[fallthrough]];
        case 4:
            t1 ^= static_cast<uint64_t>(tail[3]) << 24;
            [[fallthrough]];
        case 3:
            t1 ^= static_cast<uint64_t>(tail[2]) << 16;
            [[fallthrough]];
        case 2:
            t1 ^= static_cast<uint64_t>(tail[1]) << 8;
            [[fallthrough]];
        case 1:
            t1 ^= static_cast<uint64_t>(tail[0]) << 0;
            t1 *= c1;
            t1 = rotating_left(t1, 31);
            t1 *= c2;
            has ^= t1;
            [[fallthrough]];
        default:
            break;
    }

    // Finalization: fold in the length, cross-mix both words, avalanche, cross-mix again.
    has ^= static_cast<uint64_t>(len);
    hax ^= static_cast<uint64_t>(len);
    has += hax;
    hax += has;
    has = mixture(has);
    hax = mixture(hax);
    has += hax;
    hax += has;
    return hax;
}
/**
 * Hashes the bytes accumulated in this thread's hash buffer.
 *
 * @return 0 when g_hash_offset equals g_hash_buf_max_size (the marker several
 *         add_param_to_buf overloads set), otherwise gen_hash over the first
 *         g_hash_offset bytes of g_hash_buf.
 */
uint64_t calc_hash_id()
{
    const bool bufferMarkedInvalid = (g_hash_offset == g_hash_buf_max_size);
    if (bufferMarkedInvalid) {
        return 0;
    }
    return gen_hash(g_hash_buf, g_hash_offset);
}
// Last-resort symbol lookup used by GetOpApiFuncAddr(): tries the listed feature libraries in order
// and returns nullptr when the end of the function is reached.
// NOTE(review): GET_OP_API_FUNC_FROM_FEATURE_LIB is defined outside this file; presumably it opens
// the named library once, looks up api_name and returns the address from this function when found - confirm.
void *GetOpApiFuncAddrFromFeatureLib(const char *api_name)
{
GET_OP_API_FUNC_FROM_FEATURE_LIB(ops_infer_handler, "libaclnn_ops_infer.so", api_name);
GET_OP_API_FUNC_FROM_FEATURE_LIB(ops_train_handler, "libaclnn_ops_train.so", api_name);
GET_OP_API_FUNC_FROM_FEATURE_LIB(math_handler, "libaclnn_math.so", api_name);
GET_OP_API_FUNC_FROM_FEATURE_LIB(sparse_handler, "libaclnn_sparse.so", api_name);
GET_OP_API_FUNC_FROM_FEATURE_LIB(fft_handler, "libaclnn_fft.so", api_name);
GET_OP_API_FUNC_FROM_FEATURE_LIB(rand_handler, "libaclnn_rand.so", api_name);
return nullptr;
}
/**
 * Checks that both entry points of an aclnn kernel can be resolved.
 *
 * @param aclnn_name Kernel API name, e.g. "aclnnFoo".
 * @return true when both `aclnn_name` and `aclnn_name + "GetWorkspaceSize"` resolve
 *         through GetOpApiFuncAddr().
 */
bool check_aclnn_kernel_available(std::string aclnn_name)
{
    if (GetOpApiFuncAddr(aclnn_name.c_str()) == nullptr) {
        return false;
    }
    const std::string workspaceApi = aclnn_name + "GetWorkspaceSize";
    return GetOpApiFuncAddr(workspaceApi.c_str()) != nullptr;
}
// Rewrites shape and stride vectors, in place, for a tensor with a 4-bit dtype by scaling selected
// entries with FP4_IN_INT8.
// NOTE(review): presumably converts from units of packed bytes to units of 4-bit elements, with
// FP4_IN_INT8 being the number of 4-bit values per byte and PENULTIMATE_DIM being 2 - confirm.
//
// at_tensor     - only its number of dimensions is read.
// wrapperStride - strides to adjust; must have at least nDim entries.
// wrapperShape  - shape to adjust; must have at least nDim entries.
// Raises via TORCH_CHECK when the tensor has no dimensions.
inline void CollectB4ShapeInfo(const at::Tensor &at_tensor,
c10::SmallVector<int64_t, MAX_DIM_NUM>& wrapperStride,
c10::SmallVector<int64_t, MAX_DIM_NUM>& wrapperShape)
{
int64_t nDim = at_tensor.sizes().size();
if (nDim == 1) {
// 1-D: only the single extent is scaled.
wrapperShape[0] = wrapperShape[0] * FP4_IN_INT8;
} else if (nDim > 1) {
// Both trailing strides are 1: the dimension whose extent is 1 decides which side is scaled.
// If neither extent is 1, the trailing two dimensions are left untouched.
if (wrapperStride[nDim - 1] == 1 && wrapperStride[nDim - PENULTIMATE_DIM] == 1) {
if (wrapperShape[nDim - PENULTIMATE_DIM] == 1) {
wrapperStride[nDim - 1] = wrapperStride[nDim - 1] * FP4_IN_INT8;
wrapperShape[nDim - PENULTIMATE_DIM] = wrapperShape[nDim - PENULTIMATE_DIM] * FP4_IN_INT8;
} else if (wrapperShape[nDim - 1] == 1) {
wrapperStride[nDim - PENULTIMATE_DIM] = wrapperStride[nDim - PENULTIMATE_DIM] * FP4_IN_INT8;
wrapperShape[nDim - 1] = wrapperShape[nDim - 1] * FP4_IN_INT8;
}
} else if (wrapperStride[nDim - 1] == 1) {
// Last dimension has stride 1: scale its extent and the other dimension's stride.
wrapperStride[nDim - PENULTIMATE_DIM] =
wrapperStride[nDim - PENULTIMATE_DIM] * FP4_IN_INT8;
wrapperShape[nDim - 1] = wrapperShape[nDim - 1] * FP4_IN_INT8;
} else if (wrapperStride[nDim - PENULTIMATE_DIM] == 1) {
// The other trailing dimension has stride 1: mirror of the case above.
wrapperStride[nDim - 1] =
wrapperStride[nDim - 1] * FP4_IN_INT8;
wrapperShape[nDim - PENULTIMATE_DIM] =
wrapperShape[nDim - PENULTIMATE_DIM] * FP4_IN_INT8;
}
// Strides of all dimensions before the trailing ones are always scaled.
for (auto i = 0; i < nDim - PENULTIMATE_DIM; i++) {
wrapperStride[i] = wrapperStride[i] * FP4_IN_INT8;
}
} else {
TORCH_CHECK(false, "unsupported tensor size() in 4-bit dtype.", OPS_ERROR(ErrCode::VALUE));
}
}
// Returns true for the three ACL data types this file treats as 4-bit:
// ACL_FLOAT4_E2M1, ACL_FLOAT4_E1M2 and ACL_INT4.
inline bool Is4BitDtype(const aclDataType acl_data_type)
{
    const bool isFloat4 = (acl_data_type == ACL_FLOAT4_E2M1) || (acl_data_type == ACL_FLOAT4_E1M2);
    return isFloat4 || (acl_data_type == ACL_INT4);
}
void *GetOpApiFuncAddr(const char *apiName)
{
if (!g_custom_lib_path.empty()) {
for (auto &it : g_custom_lib_path) {
auto cust_opapi_lib = real_path(it + "/" + GetCustOpApiLibName());
if (cust_opapi_lib.empty()) {
continue;
}
auto custOpApiHandler = GetOpApiLibHandler(cust_opapi_lib.c_str());
if (custOpApiHandler != nullptr) {
auto funcAddr =
GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);
if (funcAddr != nullptr) {
if (!checkOwner(cust_opapi_lib)) {
continue;
}
ASCEND_LOGI("%s is found in %s.", apiName, cust_opapi_lib.c_str());
return funcAddr;
}
}
}
ASCEND_LOGI("%s is not in custom lib.", apiName);
}
if (!g_default_custom_lib_path.empty()) {
for (auto &it : g_default_custom_lib_path) {
auto default_cust_opapi_lib = real_path(it + "/" + GetCustOpApiLibName());
if (default_cust_opapi_lib.empty()) {
continue;
}
auto custOpApiHandler = GetOpApiLibHandler(default_cust_opapi_lib.c_str());
if (custOpApiHandler != nullptr) {
auto funcAddr =
GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);
if (funcAddr != nullptr) {
if (!checkOwner(default_cust_opapi_lib)) {
continue;
}
ASCEND_LOGI("%s is found in %s.", apiName, default_cust_opapi_lib.c_str());
return funcAddr;
}
}
}
ASCEND_LOGI("%s is not in default custom lib.", apiName);
}
if (!g_opApiHandlers.empty()) {
for (size_t i = 0; i < g_opApiHandlers.size(); ++i) {
if (g_opApiHandlers[i] != nullptr) {
auto funcAddr = GetOpApiFuncAddrInLib(g_opApiHandlers[i], g_opApiSoFiles[i].c_str(), apiName);
if (funcAddr != nullptr) {
ASCEND_LOGI("%s is found in %s.", apiName, g_opApiSoFiles[i].c_str());
return funcAddr;
}
}
}
}
static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName());
if (opApiHandler != nullptr) {
auto funcAddr = GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName);
if (funcAddr != nullptr) {
return funcAddr;
}
}
return GetOpApiFuncAddrFromFeatureLib(apiName);
}
// Creates an aclTensor describing `at_tensor` through the runtime-resolved aclCreateTensor.
// Returns nullptr when aclCreateTensor cannot be resolved or the tensor is undefined.
// Raises via TORCH_CHECK when the item size is not positive, or when a 4-bit tensor in a non-base
// format uses a format missing from FORMAT_FAKE_TO_REAL.
aclTensor *ConvertType(const at::Tensor &at_tensor)
{
// Function-local static: resolved only on the first call.
static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
if (aclCreateTensor == nullptr) {
return nullptr;
}
if (!at_tensor.defined()) {
return nullptr;
}
CheckNpuTensorValid(at_tensor);
at::ScalarType scalar_data_type = at_tensor.scalar_type();
aclDataType acl_data_type = at_npu::native::OpPreparation::convert_to_acl_data_type(scalar_data_type);
c10::SmallVector<int64_t, MAX_DIM_NUM> storageDims;
// Working copies of strides/shape; CollectB4ShapeInfo may rewrite them for 4-bit dtypes.
c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperStride = op_infer::array_to_small_vector(at_tensor.strides());
c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperShape = op_infer::array_to_small_vector(at_tensor.sizes());
const auto dimNum = at_tensor.sizes().size();
aclFormat format = ACL_FORMAT_ND;
if (!at_npu::native::FormatHelper::IsOpInputBaseFormat(at_tensor)) {
// Non-base format: format and storage dims come from the NPU storage descriptor.
format = torch_npu::NPUBridge::GetNpuStorageImpl(at_tensor)->npu_desc_.npu_format_;
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::VALUE));
storageDims = torch_npu::NPUBridge::GetNpuStorageImpl(at_tensor)->npu_desc_.storage_sizes_;
if (Is4BitDtype(acl_data_type)) {
// NOTE(review): back() assumes storage_sizes_ is non-empty - confirm.
storageDims.back() *= FP4_IN_INT8;
CollectB4ShapeInfo(at_tensor, wrapperStride, wrapperShape);
auto realFormat = FORMAT_FAKE_TO_REAL.find(format);
TORCH_CHECK(realFormat != FORMAT_FAKE_TO_REAL.end(), "not support convert ", format, ".",
OPS_ERROR(ErrCode::VALUE));
format = realFormat->second;
}
}
} else {
// Base format: the ACL format is chosen from the number of dimensions.
switch (dimNum) {
case NCL_DIM_NUM:
format = ACL_FORMAT_NCL;
break;
case NCHW_DIM_NUM:
format = ACL_FORMAT_NCHW;
break;
case NCDHW_DIM_NUM:
format = ACL_FORMAT_NCDHW;
break;
default:
format = ACL_FORMAT_ND;
}
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::VALUE));
// Storage is described as one dimension: element count, scaled by FP4_IN_INT8 for 4-bit dtypes.
if (Is4BitDtype(acl_data_type)) {
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize() * FP4_IN_INT8);
CollectB4ShapeInfo(at_tensor, wrapperStride, wrapperShape);
} else {
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize());
}
}
}
// A scalar wrapped into a tensor is first copied to the device; the aclTensor then describes that copy.
// NOTE(review): aclInput is a local and goes out of scope on return while its storage pointer has
// been handed to aclCreateTensor - confirm the storage stays valid for the aclTensor's lifetime.
if (at_npu::native::OpPreparation::is_scalar_wrapped_to_tensor(at_tensor)) {
c10::Scalar expScalar = at_tensor.item();
at::Tensor aclInput = at_npu::native::OpPreparation::copy_scalar_to_device(expScalar, scalar_data_type);
return aclCreateTensor(aclInput.sizes().data(), aclInput.sizes().size(), acl_data_type,
aclInput.strides().data(), aclInput.storage_offset(), format, storageDims.data(),
storageDims.size(), const_cast<void *>(aclInput.storage().data()));
}
auto acl_tensor =
aclCreateTensor(wrapperShape.data(), at_tensor.sizes().size(), acl_data_type, wrapperStride.data(),
at_tensor.storage_offset(), format, storageDims.data(), storageDims.size(),
const_cast<void *>(at_tensor.storage().data()));
return acl_tensor;
}
// Creates an aclScalar from an at::Scalar through the runtime-resolved aclCreateScalar.
// Returns nullptr when aclCreateScalar cannot be resolved or the scalar is not one of
// Double, Long, Bool or ComplexDouble.
aclScalar *ConvertType(const at::Scalar &at_scalar)
{
    static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar);
    if (aclCreateScalar == nullptr) {
        return nullptr;
    }

    const at::ScalarType kind = at_scalar.type();
    aclDataType aclKind = at_npu::native::OpPreparation::convert_to_acl_data_type(kind);

    if (kind == at::ScalarType::Double) {
        double payload = at_scalar.toDouble();
        return aclCreateScalar(&payload, aclKind);
    }
    if (kind == at::ScalarType::Long) {
        int64_t payload = at_scalar.toLong();
        return aclCreateScalar(&payload, aclKind);
    }
    if (kind == at::ScalarType::Bool) {
        bool payload = at_scalar.toBool();
        return aclCreateScalar(&payload, aclKind);
    }
    if (kind == at::ScalarType::ComplexDouble) {
        auto payload = at_scalar.toComplexDouble();
        return aclCreateScalar(&payload, aclKind);
    }
    return nullptr;
}
// Creates an aclIntArray from an int array; returns nullptr when aclCreateIntArray cannot be resolved.
aclIntArray *ConvertType(const at::IntArrayRef &at_array)
{
    static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);
    if (aclCreateIntArray != nullptr) {
        return aclCreateIntArray(at_array.data(), at_array.size());
    }
    return nullptr;
}
// Creates an aclIntArray from a SymInt array reinterpreted as plain int64 values (unchecked);
// returns nullptr when aclCreateIntArray cannot be resolved.
aclIntArray *ConvertType(const at::ArrayRef<c10::SymInt> &at_array)
{
    static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);
    if (aclCreateIntArray != nullptr) {
        auto plainInts = c10::asIntArrayRefUnchecked(at_array);
        return aclCreateIntArray(plainInts.data(), plainInts.size());
    }
    return nullptr;
}
// Creates an aclBoolArray from a bool array; returns nullptr when aclCreateBoolArray cannot be resolved.
aclBoolArray *ConvertType(const at::ArrayRef<bool> &value)
{
    static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
    if (aclCreateBoolArray != nullptr) {
        return aclCreateBoolArray(value.data(), value.size());
    }
    return nullptr;
}
// Creates an aclTensorList by converting every tensor of the list.
// Returns nullptr for an empty list or when aclCreateTensorList cannot be resolved.
aclTensorList *ConvertType(const at::TensorList &at_tensor_list)
{
    const size_t tensorCount = at_tensor_list.size();
    if (tensorCount == 0) {
        return nullptr;
    }
    static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);
    if (aclCreateTensorList == nullptr) {
        return nullptr;
    }

    std::vector<const aclTensor *> converted(tensorCount);
    size_t slot = 0;
    for (const auto &tensor : at_tensor_list) {
        converted[slot] = ConvertType(tensor);
        ++slot;
    }
    return aclCreateTensorList(converted.data(), converted.size());
}
// Creates an aclScalarList by converting every scalar of the list.
// Returns nullptr when aclCreateScalarList cannot be resolved. An empty list is still passed on.
aclScalarList *ConvertType(const at::ArrayRef<at::Scalar> &at_scalar_list)
{
    static const auto aclCreateScalarList = GET_OP_API_FUNC(aclCreateScalarList);
    if (aclCreateScalarList == nullptr) {
        return nullptr;
    }

    std::vector<const aclScalar *> converted(at_scalar_list.size());
    size_t slot = 0;
    for (const auto &scalar : at_scalar_list) {
        converted[slot] = ConvertType(scalar);
        ++slot;
    }
    return aclCreateScalarList(converted.data(), converted.size());
}
// Converts an optional tensor; returns nullptr when it is absent or undefined.
aclTensor *ConvertType(const c10::optional<at::Tensor> &opt_tensor)
{
    const bool usable = opt_tensor.has_value() && opt_tensor.value().defined();
    if (!usable) {
        return nullptr;
    }
    return ConvertType(opt_tensor.value());
}
// Converts an optional int array; returns nullptr when it is absent.
aclIntArray *ConvertType(const c10::optional<at::IntArrayRef> &opt_array)
{
    if (!opt_array.has_value()) {
        return nullptr;
    }
    return ConvertType(opt_array.value());
}
// Converts an optional SymInt array; returns nullptr when it is absent.
aclIntArray *ConvertType(const c10::OptionalArrayRef<c10::SymInt> &opt_array)
{
    if (!opt_array.has_value()) {
        return nullptr;
    }
    return ConvertType(opt_array.value());
}
// Converts an optional int array reference; returns nullptr when it is absent.
aclIntArray *ConvertType(const c10::OptionalIntArrayRef &opt_array)
{
    if (!opt_array.has_value()) {
        return nullptr;
    }
    return ConvertType(opt_array.value());
}
// Converts an optional scalar; returns nullptr when it is absent.
aclScalar *ConvertType(const c10::optional<at::Scalar> &opt_scalar)
{
    if (!opt_scalar.has_value()) {
        return nullptr;
    }
    return ConvertType(opt_scalar.value());
}
// Maps an ATen scalar type to the ACL data type by delegating to OpPreparation::convert_to_acl_data_type.
aclDataType ConvertType(const at::ScalarType scalarType)
{
    aclDataType aclType = at_npu::native::OpPreparation::convert_to_acl_data_type(scalarType);
    return aclType;
}
// Creates an aclTensor describing the wrapped tensor through the runtime-resolved aclCreateTensor,
// using the wrapper's dtype as the ACL data type instead of deriving it from the tensor.
// Unlike the at::Tensor overload there is no special path for scalars wrapped into tensors.
// Returns nullptr when aclCreateTensor cannot be resolved or the tensor is undefined.
// Raises via TORCH_CHECK when the item size is not positive, or when a 4-bit tensor in a non-base
// format uses a format missing from FORMAT_FAKE_TO_REAL.
aclTensor *ConvertType(const TensorWrapper &tensor_r)
{
// Function-local static: resolved only on the first call.
static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
if (aclCreateTensor == nullptr) {
return nullptr;
}
const at::Tensor &at_tensor = tensor_r.tensor_;
if (!at_tensor.defined()) {
return nullptr;
}
CheckNpuTensorValid(at_tensor);
aclDataType acl_data_type = tensor_r.dtype;
c10::SmallVector<int64_t, MAX_DIM_NUM> storageDims;
// Working copies of strides/shape; CollectB4ShapeInfo may rewrite them for 4-bit dtypes.
c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperStride = op_infer::array_to_small_vector(at_tensor.strides());
c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperShape = op_infer::array_to_small_vector(at_tensor.sizes());
const auto dimNum = at_tensor.sizes().size();
aclFormat format = ACL_FORMAT_ND;
if (!at_npu::native::FormatHelper::IsOpInputBaseFormat(at_tensor)) {
// Non-base format: format and storage dims come from the NPU storage descriptor.
format = torch_npu::NPUBridge::GetNpuStorageImpl(at_tensor)->npu_desc_.npu_format_;
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::VALUE));
storageDims = torch_npu::NPUBridge::GetNpuStorageImpl(at_tensor)->npu_desc_.storage_sizes_;
if (Is4BitDtype(acl_data_type)) {
// NOTE(review): back() assumes storage_sizes_ is non-empty - confirm.
storageDims.back() *= FP4_IN_INT8;
CollectB4ShapeInfo(at_tensor, wrapperStride, wrapperShape);
auto realFormat = FORMAT_FAKE_TO_REAL.find(format);
TORCH_CHECK(realFormat != FORMAT_FAKE_TO_REAL.end(), "not support convert ", format, ".",
OPS_ERROR(ErrCode::VALUE));
format = realFormat->second;
}
}
} else {
// Base format: the ACL format is chosen from the number of dimensions.
switch (dimNum) {
case NCL_DIM_NUM:
format = ACL_FORMAT_NCL;
break;
case NCHW_DIM_NUM:
format = ACL_FORMAT_NCHW;
break;
case NCDHW_DIM_NUM:
format = ACL_FORMAT_NCDHW;
break;
default:
format = ACL_FORMAT_ND;
}
if (acl_data_type != ACL_STRING) {
TORCH_CHECK(at_tensor.itemsize() > 0, "the itemsize of tensor must be greater than 0.",
OPS_ERROR(ErrCode::VALUE));
// Storage is described as one dimension: element count, scaled by FP4_IN_INT8 for 4-bit dtypes.
if (Is4BitDtype(acl_data_type)) {
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize() * FP4_IN_INT8);
CollectB4ShapeInfo(at_tensor, wrapperStride, wrapperShape);
} else {
storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize());
}
}
}
auto acl_tensor =
aclCreateTensor(wrapperShape.data(), at_tensor.sizes().size(), acl_data_type, wrapperStride.data(),
at_tensor.storage_offset(), format, storageDims.data(), storageDims.size(),
const_cast<void *>(at_tensor.storage().data()));
return acl_tensor;
}
// Converts every tensor in the wrapper into an aclTensor (using the wrapper's dtype)
// and packs them into an aclTensorList.
// Returns nullptr for an empty list or when aclCreateTensorList cannot be resolved.
aclTensorList *ConvertType(const TensorListWrapper &tensor_list_wrapper)
{
    const auto &tensors = tensor_list_wrapper.tensor_list_;
    if (tensors.size() == 0) {
        return nullptr;
    }
    static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);
    if (aclCreateTensorList == nullptr) {
        return nullptr;
    }
    std::vector<const aclTensor *> converted;
    converted.reserve(tensors.size());
    for (size_t idx = 0; idx < tensors.size(); ++idx) {
        converted.push_back(ConvertType(TensorWrapper{tensors[idx], tensor_list_wrapper.dtype}));
    }
    return aclCreateTensorList(converted.data(), converted.size());
}
// Builds an aclTensor from a TensorStruct snapshot.
// Returns nullptr when the aclCreateTensor symbol cannot be resolved or `at_tensor` is null.
// Raises a TORCH_CHECK failure for a non-positive itemsize, and for a base-format 4-bit
// tensor whose strides match none of the handled layouts.
// NOTE(review): only the base-format path applies the 4-bit shape/stride scaling; the
// non-base-format path copies storage_sizes unchanged — confirm this is intended.
aclTensor *ConvertTypeV2(TensorStructPtr at_tensor)
{
    static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
    if (aclCreateTensor == nullptr || at_tensor == nullptr) {
        return nullptr;
    }
    aclDataType acl_data_type = at_tensor->acl_type;
    const auto rank = at_tensor->sizes.size();
    // storageDims stays empty for ACL_STRING tensors.
    c10::SmallVector<int64_t, MAX_DIM_NUM> storageDims;
    c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperStride = op_infer::array_to_small_vector(at_tensor->strides);
    c10::SmallVector<int64_t, MAX_DIM_NUM> wrapperShape = op_infer::array_to_small_vector(at_tensor->sizes);

    // Multiplies the stride of every dimension ahead of the last PENULTIMATE_DIM ones.
    // The index is unsigned and the bound is written without subtraction, so there is
    // no signed/unsigned comparison and no wrap-around.
    const auto scaleLeadingStrides = [&wrapperStride, rank]() {
        for (size_t i = 0; i + PENULTIMATE_DIM < rank; ++i) {
            wrapperStride[i] *= FP4_IN_INT8;
        }
    };

    aclFormat format = ACL_FORMAT_ND;
    if (!at_npu::native::FormatHelper::IsBaseFormatType(at_tensor->acl_format)) {
        // Non-base format: reuse the recorded format and storage sizes as they are.
        format = at_tensor->acl_format;
        if (acl_data_type != ACL_STRING) {
            TORCH_CHECK(at_tensor->itemsize > 0, "the itemsize of tensor must be greater than 0.",
                        OPS_ERROR(ErrCode::VALUE));
            storageDims = at_tensor->storage_sizes;
        }
    } else {
        // Base format: choose the ACL format from the number of dimensions.
        switch (rank) {
            case NCL_DIM_NUM:
                format = ACL_FORMAT_NCL;
                break;
            case NCHW_DIM_NUM:
                format = ACL_FORMAT_NCHW;
                break;
            case NCDHW_DIM_NUM:
                format = ACL_FORMAT_NCDHW;
                break;
            default:
                format = ACL_FORMAT_ND;
        }
        if (acl_data_type != ACL_STRING) {
            TORCH_CHECK(at_tensor->itemsize > 0, "the itemsize of tensor must be greater than 0.",
                        OPS_ERROR(ErrCode::VALUE));
            if (Is4BitDtype(acl_data_type)) {
                // Storage is one dimension: element count scaled by FP4_IN_INT8.
                storageDims.push_back(at_tensor->nbytes / at_tensor->itemsize * FP4_IN_INT8);
                if (rank == 1) {
                    wrapperShape[0] *= FP4_IN_INT8;
                } else if (rank > 1 && wrapperStride[rank - 1] == 1) {
                    // Last dimension has unit stride: widen it and scale all other strides.
                    wrapperStride[rank - PENULTIMATE_DIM] *= FP4_IN_INT8;
                    scaleLeadingStrides();
                    wrapperShape[rank - 1] *= FP4_IN_INT8;
                } else if (rank > 1 && wrapperStride[rank - PENULTIMATE_DIM] == 1) {
                    // Penultimate dimension has unit stride: widen it and scale all other strides.
                    wrapperStride[rank - 1] *= FP4_IN_INT8;
                    scaleLeadingStrides();
                    wrapperShape[rank - PENULTIMATE_DIM] *= FP4_IN_INT8;
                } else {
                    TORCH_CHECK(false, "unsupported tensor wrapper strides in 4-bit dtype.", OPS_ERROR(ErrCode::VALUE));
                }
            } else {
                storageDims.push_back(at_tensor->nbytes / at_tensor->itemsize);
            }
        }
    }
    return aclCreateTensor(
        wrapperShape.data(), rank, acl_data_type, wrapperStride.data(),
        at_tensor->storage_offset, format, storageDims.data(), storageDims.size(), at_tensor->data_ptr);
}
// Captures the fields of an at::Tensor needed to build an aclTensor later into a
// shared TensorStruct: storage pointer, ACL dtype, NPU format, storage byte size,
// item size, storage offset, sizes, strides and NPU storage sizes.
// Returns nullptr for an undefined tensor.
TensorStructPtr CopyTypeV2(const at::Tensor &at_tensor)
{
    if (!at_tensor.defined()) {
        return nullptr;
    }
    CheckNpuTensorValid(at_tensor);
    aclDataType dtype = at_npu::native::OpPreparation::convert_to_acl_data_type(at_tensor.scalar_type());
    auto &storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(at_tensor)->npu_desc_;
    return std::make_shared<TensorStruct>(
        const_cast<void *>(at_tensor.storage().data()),
        dtype,
        storage_desc.npu_format_,
        at_tensor.storage().nbytes(),
        at_tensor.itemsize(),
        at_tensor.storage_offset(),
        at_tensor.sizes(),
        at_tensor.strides(),
        storage_desc.storage_sizes_);
}
// Same snapshot as the at::Tensor overload, except that the ACL dtype comes from
// the wrapper (tensor_r.dtype) instead of the tensor's scalar type.
// Returns nullptr when the wrapped tensor is undefined.
TensorStructPtr CopyTypeV2(const TensorWrapper &tensor_r)
{
    const at::Tensor &wrapped = tensor_r.tensor_;
    if (!wrapped.defined()) {
        return nullptr;
    }
    CheckNpuTensorValid(wrapped);
    auto &storage_desc = torch_npu::NPUBridge::GetNpuStorageImpl(wrapped)->npu_desc_;
    return std::make_shared<TensorStruct>(
        const_cast<void *>(wrapped.storage().data()),
        tensor_r.dtype,
        storage_desc.npu_format_,
        wrapped.storage().nbytes(),
        wrapped.itemsize(),
        wrapped.storage_offset(),
        wrapped.sizes(),
        wrapped.strides(),
        storage_desc.storage_sizes_);
}
// Creates an aclScalar from an at::Scalar.
// Handles Double, Long, Bool and ComplexDouble scalars; any other scalar type
// yields nullptr, as does a missing aclCreateScalar symbol.
aclScalar *ConvertTypeV2(const at::Scalar &at_scalar)
{
    static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar);
    if (aclCreateScalar == nullptr) {
        return nullptr;
    }
    const at::ScalarType scalar_kind = at_scalar.type();
    aclDataType acl_dtype = at_npu::native::OpPreparation::convert_to_acl_data_type(scalar_kind);
    switch (scalar_kind) {
        case at::ScalarType::Double: {
            double payload = at_scalar.toDouble();
            return aclCreateScalar(&payload, acl_dtype);
        }
        case at::ScalarType::Long: {
            int64_t payload = at_scalar.toLong();
            return aclCreateScalar(&payload, acl_dtype);
        }
        case at::ScalarType::Bool: {
            bool payload = at_scalar.toBool();
            return aclCreateScalar(&payload, acl_dtype);
        }
        case at::ScalarType::ComplexDouble: {
            auto payload = at_scalar.toComplexDouble();
            return aclCreateScalar(&payload, acl_dtype);
        }
        default:
            return nullptr;
    }
}
// Creates an aclIntArray from the given values.
// Returns nullptr when aclCreateIntArray cannot be resolved.
aclIntArray *ConvertTypeV2(const std::vector<int64_t> &int_list)
{
    static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);
    return (aclCreateIntArray == nullptr) ? nullptr : aclCreateIntArray(int_list.data(), int_list.size());
}
// Copies the referenced integers into an owning vector.
std::vector<int64_t> CopyTypeV2(const at::IntArrayRef &at_array)
{
    return std::vector<int64_t>(at_array.begin(), at_array.end());
}
// Reinterprets the SymInt array as plain integers via asIntArrayRefUnchecked
// and copies them into an owning vector.
std::vector<int64_t> CopyTypeV2(const at::ArrayRef<c10::SymInt> &at_array)
{
    return c10::asIntArrayRefUnchecked(at_array).vec();
}
// Creates an aclBoolArray from a std::vector<bool>.
// Returns nullptr when aclCreateBoolArray cannot be resolved.
aclBoolArray *ConvertTypeV2(const std::vector<bool> &value)
{
    static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
    if (aclCreateBoolArray == nullptr) {
        return nullptr;
    }
    // std::vector<bool> exposes no contiguous bool storage, so the elements are copied
    // into a temporary bool buffer. new[] reports allocation failure by throwing rather
    // than handing back a null buffer, and unique_ptr releases the buffer on every path.
    const size_t count = value.size();
    std::unique_ptr<bool[]> unpacked(new bool[count]);
    for (size_t i = 0; i < count; ++i) {
        unpacked[i] = value[i];
    }
    // The temporary buffer is released right after aclCreateBoolArray returns.
    return aclCreateBoolArray(unpacked.get(), count);
}
// Copies the referenced booleans into an owning vector.
std::vector<bool> CopyTypeV2(const at::ArrayRef<bool> &value)
{
    return std::vector<bool>(value.begin(), value.end());
}
// Converts each TensorStruct snapshot into an aclTensor and packs the results
// into an aclTensorList. Returns nullptr when aclCreateTensorList cannot be resolved.
aclTensorList *ConvertTypeV2(const std::vector<TensorStructPtr> &at_tensor_list)
{
    static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);
    if (aclCreateTensorList == nullptr) {
        return nullptr;
    }
    std::vector<const aclTensor *> converted;
    converted.reserve(at_tensor_list.size());
    for (const TensorStructPtr &snapshot : at_tensor_list) {
        converted.push_back(ConvertTypeV2(snapshot));
    }
    return aclCreateTensorList(converted.data(), converted.size());
}
// Snapshots every tensor of the list with the single-tensor CopyTypeV2 overload.
std::vector<TensorStructPtr> CopyTypeV2(const at::TensorList &at_tensor_list)
{
    std::vector<TensorStructPtr> snapshots;
    snapshots.reserve(at_tensor_list.size());
    for (const at::Tensor &tensor : at_tensor_list) {
        snapshots.push_back(CopyTypeV2(tensor));
    }
    return snapshots;
}
std::vector<TensorStructPtr> CopyTypeV2(const TensorListWrapper &tensor_list_wrapper)
{
std::vector<TensorStructPtr> tensor_list(tensor_list_wrapper.tensor_list_.size());
for (size_t i = 0; i < tensor_list.size(); i++) {
tensor_list[i] = CopyTypeV2(TensorWrapper{
tensor_list_wrapper.tensor_list_[i], tensor_list_wrapper.dtype});
}
return tensor_list;
}
// Converts each scalar into an aclScalar and packs the results into an aclScalarList.
// Returns nullptr when aclCreateScalarList cannot be resolved.
aclScalarList *ConvertTypeV2(const std::vector<at::Scalar> &at_scalar_list)
{
    static const auto aclCreateScalarList = GET_OP_API_FUNC(aclCreateScalarList);
    if (aclCreateScalarList == nullptr) {
        return nullptr;
    }
    std::vector<const aclScalar *> converted;
    converted.reserve(at_scalar_list.size());
    for (const at::Scalar &scalar : at_scalar_list) {
        converted.push_back(ConvertTypeV2(scalar));
    }
    return aclCreateScalarList(converted.data(), converted.size());
}
// Copies the referenced scalars into an owning vector.
std::vector<at::Scalar> CopyTypeV2(const at::ArrayRef<at::Scalar> &at_scalar_list)
{
    return std::vector<at::Scalar>(at_scalar_list.begin(), at_scalar_list.end());
}
// Snapshots an optional tensor.
// Returns nullptr when the optional is empty or holds an undefined tensor.
TensorStructPtr CopyTypeV2(const c10::optional<at::Tensor> &opt_tensor)
{
    if (!opt_tensor.has_value() || !opt_tensor.value().defined()) {
        return nullptr;
    }
    return CopyTypeV2(opt_tensor.value());
}
// Converts an optional integer vector into an aclIntArray.
// Returns nullptr when the optional holds no value.
aclIntArray *ConvertTypeV2(const c10::optional<std::vector<int64_t>> &opt_array)
{
    return opt_array.has_value() ? ConvertTypeV2(opt_array.value()) : nullptr;
}
// Copies an optional int array into an optional owning vector.
// An empty input produces c10::nullopt.
c10::optional<std::vector<int64_t>> CopyTypeV2(const c10::optional<at::IntArrayRef> &opt_array)
{
    if (!opt_array.has_value()) {
        return c10::nullopt;
    }
    return CopyTypeV2(opt_array.value());
}
// Copies an optional SymInt array into an optional owning integer vector.
// An empty input produces c10::nullopt.
c10::optional<std::vector<int64_t>> CopyTypeV2(const c10::OptionalArrayRef<c10::SymInt> &opt_array)
{
    if (!opt_array.has_value()) {
        return c10::nullopt;
    }
    return CopyTypeV2(opt_array.value());
}
// Copies an OptionalIntArrayRef into an optional owning vector.
// An empty input produces c10::nullopt.
c10::optional<std::vector<int64_t>> CopyTypeV2(const c10::OptionalIntArrayRef &opt_array)
{
    if (!opt_array.has_value()) {
        return c10::nullopt;
    }
    return CopyTypeV2(opt_array.value());
}
// Converts an optional scalar into an aclScalar.
// Returns nullptr when the optional holds no value.
aclScalar *ConvertTypeV2(const c10::optional<at::Scalar> &opt_scalar)
{
    return opt_scalar.has_value() ? ConvertTypeV2(opt_scalar.value()) : nullptr;
}
// Maps an ATen scalar type to the ACL data type by delegating to
// OpPreparation::convert_to_acl_data_type.
aclDataType ConvertTypeV2(const at::ScalarType scalarType)
{
    using at_npu::native::OpPreparation;
    return OpPreparation::convert_to_acl_data_type(scalarType);
}
// Returns a mutable-typed pointer to the string's own character buffer.
// No copy is made: the pointer refers to `str`'s storage, so it is only usable
// while `str` is alive and unmodified.
char* ConvertTypeV2(const std::string &str)
{
    return const_cast<char *>(str.c_str());
}
// Copies a NUL-terminated C string into an owning std::string.
// A null pointer yields an empty string; constructing std::string directly from
// a null pointer is undefined behaviour, so it is guarded here.
std::string CopyTypeV2(char* str)
{
    if (str == nullptr) {
        return std::string();
    }
    return std::string(str);
}
// Appends `size` bytes from `data` to the thread-local hash buffer and advances
// g_hash_offset. When the append would go past g_hash_buf_size, nothing is copied
// and g_hash_offset is set to g_hash_buf_max_size instead.
void MemcpyToBufImpl(const void* data, size_t size)
{
    const bool fits = !(g_hash_offset + size > g_hash_buf_size);
    if (fits) {
        memcpy(g_hash_buf + g_hash_offset, data, size);
        g_hash_offset += size;
        return;
    }
    g_hash_offset = g_hash_buf_max_size;
}
bool CacheParams::GetDeterministicStatus() const
{
return deterministic_status_;
}
uint32_t CacheParams::GetAicNum() const
{
return aic_num_;
}
uint32_t CacheParams::GetAivNum() const
{
return aiv_num_;
}
// Collects the current cache-relevant settings: the global deterministic-algorithms
// flag and, only when core control is enabled, the cube/vector core resources of the
// current thread. Without core control the core counts keep their default values.
CacheParams GetCacheParams()
{
    CacheParams snapshot;
    snapshot.deterministic_status_ = at::globalContext().deterministicAlgorithms();
    if (!c10_npu::is_core_control_enabled()) {
        return snapshot;
    }
    snapshot.aic_num_ = c10_npu::GetResInCurrentThread(c10_npu::acl::ACL_RT_DEV_RES_CUBE_CORE);
    snapshot.aiv_num_ = c10_npu::GetResInCurrentThread(c10_npu::acl::ACL_RT_DEV_RES_VECTOR_CORE);
    return snapshot;
}
// Resolves the op API entry point and its workspace-size companion by name and
// stores both addresses in the output references.
// Raises a TORCH_CHECK failure when either symbol cannot be resolved.
void GetApiFunc(
    const char* api_name,
    const char* workspace_api_name,
    void*& opApiFuncAddr,
    void*& getWorkspaceSizeFuncAddr
    )
{
    opApiFuncAddr = GetOpApiFuncAddr(api_name);
    getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(workspace_api_name);
    const bool both_resolved = !(opApiFuncAddr == nullptr || getWorkspaceSizeFuncAddr == nullptr);
    TORCH_CHECK(both_resolved,
                api_name, " or ", workspace_api_name, " not in ", GetOpApiLibName(),
                ", or ", GetOpApiLibName(), " not found.",
                OPS_ERROR(ErrCode::PTR));
}
void InitExecCommonCtx()
{
void* initMemAddr = GetOpApiFuncAddr("InitHugeMemThreadLocal");
InitHugeMemThreadLocal initMemFunc = initMemAddr ? reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr) : nullptr;
if (initMemFunc) {
initMemFunc(nullptr, false);
}
}
// Binds the stream's resources to the current thread when
// check_dequeue_need_use reports that it is required for this stream.
void InitExecSubTheadCtx(aclrtStream acl_stream)
{
    if (!c10_npu::check_dequeue_need_use(acl_stream)) {
        return;
    }
    c10_npu::UseStreamResInCurrentThread(acl_stream);
}
void UnInitExecCommonCtx()
{
void* unInitMemAddr = GetOpApiFuncAddr("UnInitHugeMemThreadLocal");
UnInitHugeMemThreadLocal unInitMemFunc = unInitMemAddr ? reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr) : nullptr;
if (unInitMemFunc) {
unInitMemFunc(nullptr, false);
}
UnInitCacheThreadLocal();
}
void ReleaseExecCommonCtx()
{
void* releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem");
if (releaseMemAddr) {
ReleaseHugeMem releaseMemFunc = reinterpret_cast<ReleaseHugeMem>(releaseMemAddr);
releaseMemFunc(nullptr, false);
}
}
// Returns the current NPU stream (obtained with stream(false)). When
// check_enqueue_need_use requires it, the stream's resources are first bound
// to the current thread.
aclrtStream GetAclStream()
{
    auto current_stream = c10_npu::getCurrentNPUStream().stream(false);
    const bool bind_resources = c10_npu::check_enqueue_need_use(current_stream);
    if (bind_resources) {
        c10_npu::UseStreamResInCurrentThread(current_stream);
    }
    return current_stream;
}
void SetExecConfig()
{
at_npu::native::SetDeterministic();
}
// Applies the deterministic flag recorded in `cache_params` through
// at_npu::native::SetDeterministicOps.
void SetExecConfigV2(const CacheParams& cache_params)
{
    const bool deterministic = cache_params.GetDeterministicStatus();
    at_npu::native::SetDeterministicOps(deterministic);
}
// Allocates a workspace of `workspace_size` bytes and returns its storage address.
// Returns nullptr for a zero-sized request.
// NOTE(review): the workspace tensor is local to this function, so it no longer
// exists when the caller receives the address — confirm that memory obtained from
// unsafe_empty_workspace stays usable after the tensor is destroyed.
void* GetWorkSpaceAddr(
    uint64_t workspace_size)
{
    if (workspace_size == 0) {
        return nullptr;
    }
    auto scratch = at_npu::native::OpPreparation::unsafe_empty_workspace(workspace_size);
    return const_cast<void*>(scratch.storage().data());
}
int ExecuteApiFunc(
const void* opApiFuncAddr,
aclrtStream acl_stream,
void* workspace_addr,
uint64_t workspace_size,
aclOpExecutor* executor
)
{
OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);
auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream);
return api_ret;
}
// Allocates a workspace of `workspace_size` bytes for `acl_stream` and returns its
// storage address. Returns nullptr for a zero-sized request.
// NOTE(review): the workspace tensor is local to this function, so it no longer
// exists when the caller receives the address — confirm that memory obtained from
// unsafe_empty_workspace stays usable after the tensor is destroyed.
void* GetWorkSpaceAddrV2(
    uint64_t workspace_size, aclrtStream acl_stream)
{
    if (workspace_size == 0) {
        return nullptr;
    }
    auto scratch = at_npu::native::OpPreparation::unsafe_empty_workspace(workspace_size, acl_stream);
    return const_cast<void*>(scratch.storage().data());
}
int ExecuteApiFuncV2(
const void* opApiFuncAddr,
aclrtStream acl_stream,
uint64_t workspace_size,
aclOpExecutor* executor
)
{
auto workspace_addr = GetWorkSpaceAddrV2(workspace_size, acl_stream);
OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);
auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream);
return api_ret;
}
bool CheckAndInitFunc(const char* aclnn_api)
{
static const auto ptaGetExecCacheAddr = GetOpApiFuncAddr("PTAGetExecCache");
static const auto initPTACacheThreadLocalAddr = GetOpApiFuncAddr("InitPTACacheThreadLocal");
static const auto setPTAHashKeyAddr = GetOpApiFuncAddr("SetPTAHashKey");
static const auto canUsePTACacheAddr = GetOpApiFuncAddr("CanUsePTACache");
PTAGetExecCache ptaGetExecCacheFunc = reinterpret_cast<PTAGetExecCache>(ptaGetExecCacheAddr);
InitPTACacheThreadLocal initPTACacheThreadLocalFunc =
reinterpret_cast<InitPTACacheThreadLocal>(initPTACacheThreadLocalAddr);
SetPTAHashKey setPTAHashKeyFunc = reinterpret_cast<SetPTAHashKey>(setPTAHashKeyAddr);
CanUsePTACache canUsePTACacheFunc = reinterpret_cast<CanUsePTACache>(canUsePTACacheAddr);
bool has_valid_funcs = (ptaGetExecCacheFunc != nullptr) &&
(initPTACacheThreadLocalFunc != nullptr) &&
(setPTAHashKeyFunc != nullptr);
bool can_use_cache = (canUsePTACacheFunc != nullptr) && canUsePTACacheFunc(aclnn_api);
bool check_result = has_valid_funcs && can_use_cache;
if (!check_result) {
return false;
} else {
initPTACacheThreadLocalFunc();
g_hash_offset = 0;
}
return true;
}
bool CheckAndInitFuncV2(const char* aclnn_api)
{
static const auto ptaFindExecCacheAddr = GetOpApiFuncAddr("PTAFindExecCache");
static const auto initPTACacheThreadLocalAddr = GetOpApiFuncAddr("InitPTACacheThreadLocal");
static const auto setPTACacheHashKeyAddr = GetOpApiFuncAddr("SetPTACacheHashKey");
static const auto canUsePTACacheAddr = GetOpApiFuncAddr("CanUsePTACache");
PTAFindExecCache ptaFindExecCacheFunc = reinterpret_cast<PTAFindExecCache>(ptaFindExecCacheAddr);
InitPTACacheThreadLocal initPTACacheThreadLocalFunc =
reinterpret_cast<InitPTACacheThreadLocal>(initPTACacheThreadLocalAddr);
SetPTACacheHashKey setPTACacheHashKeyFunc = reinterpret_cast<SetPTACacheHashKey>(setPTACacheHashKeyAddr);
CanUsePTACache canUsePTACacheFunc = reinterpret_cast<CanUsePTACache>(canUsePTACacheAddr);
bool has_valid_funcs = (ptaFindExecCacheFunc != nullptr) &&
(initPTACacheThreadLocalFunc != nullptr) &&
(setPTACacheHashKeyFunc != nullptr);
bool can_use_cache = (canUsePTACacheFunc != nullptr) && canUsePTACacheFunc(aclnn_api);
bool check_result = has_valid_funcs && can_use_cache;
if (!check_result) {
return false;
} else {
initPTACacheThreadLocalFunc();
g_hash_offset = 0;
}
return true;
}
// Appends the execution configuration to the hash buffer, in this order:
// deterministic flag, the AIC/AIV core counts (only when neither equals UINT32_MAX),
// the current device, and the stream handle value.
void AddCacheConfigParams(aclrtStream acl_stream, const CacheParams& cache_params)
{
    bool deterministic = cache_params.GetDeterministicStatus();
    add_param_to_buf(deterministic);

    uint32_t cube_cores = cache_params.GetAicNum();
    uint32_t vector_cores = cache_params.GetAivNum();
    if (!(cube_cores == UINT32_MAX || vector_cores == UINT32_MAX)) {
        add_param_to_buf(cube_cores);
        add_param_to_buf(vector_cores);
    }

    auto device_index = c10_npu::current_device();
    add_param_to_buf(device_index);
    add_param_to_buf(reinterpret_cast<uintptr_t>(acl_stream));
}
// Appends the execution configuration to the V2 hash buffer, in this order:
// deterministic flag, the AIC/AIV core counts (only when neither equals UINT32_MAX),
// the API name as a std::string, and the stream handle value.
void AddCacheConfigParamsV2(aclrtStream acl_stream, const CacheParams& cache_params, const char* aclnn_api)
{
    bool deterministic = cache_params.GetDeterministicStatus();
    add_param_to_buf_v2(deterministic);

    uint32_t cube_cores = cache_params.GetAicNum();
    uint32_t vector_cores = cache_params.GetAivNum();
    if (!(cube_cores == UINT32_MAX || vector_cores == UINT32_MAX)) {
        add_param_to_buf_v2(cube_cores);
        add_param_to_buf_v2(vector_cores);
    }

    add_param_to_buf_v2(std::string(aclnn_api));
    add_param_to_buf_v2(reinterpret_cast<uintptr_t>(acl_stream));
}
// Publishes the accumulated hash key and looks up a cached executor for it.
// When g_hash_offset equals g_hash_buf_max_size the key is published as (nullptr, 0).
// The resolved function pointers are not null-checked here; presumably
// CheckAndInitFuncV2 has validated them beforehand — verify against callers.
// NOTE(review): the lookup is still issued with g_hash_buf and g_hash_offset even
// when the offset equals g_hash_buf_max_size — confirm PTAFindExecCache handles
// that length.
aclOpExecutor* GetCacheExecutorV2(uint64_t* workspace_size)
{
    static const auto ptaFindExecCacheAddr = GetOpApiFuncAddr("PTAFindExecCache");
    static const auto setPTACacheHashKeyAddr = GetOpApiFuncAddr("SetPTACacheHashKey");
    const auto findExecCache = reinterpret_cast<PTAFindExecCache>(ptaFindExecCacheAddr);
    const auto setCacheHashKey = reinterpret_cast<SetPTACacheHashKey>(setPTACacheHashKeyAddr);

    const bool key_overflowed = (g_hash_offset == g_hash_buf_max_size);
    if (!key_overflowed) {
        setCacheHashKey(reinterpret_cast<uint8_t *>(g_hash_buf), g_hash_offset);
    } else {
        setCacheHashKey(nullptr, 0);
    }
    return findExecCache(reinterpret_cast<uint8_t *>(g_hash_buf), g_hash_offset, workspace_size);
}
// Computes the hash id of the accumulated parameters, publishes it through
// SetPTAHashKey and returns the executor cached under that id.
// The resolved function pointers are not null-checked here; presumably
// CheckAndInitFunc has validated them beforehand — verify against callers.
aclOpExecutor* GetCacheExecutor(uint64_t* workspace_size)
{
    const uint64_t key = calc_hash_id();
    static const auto ptaGetExecCacheAddr = GetOpApiFuncAddr("PTAGetExecCache");
    static const auto setPTAHashKeyAddr = GetOpApiFuncAddr("SetPTAHashKey");
    const auto publishKey = reinterpret_cast<SetPTAHashKey>(setPTAHashKeyAddr);
    const auto lookupExecutor = reinterpret_cast<PTAGetExecCache>(ptaGetExecCacheAddr);
    publishKey(key);
    return lookupExecutor(key, workspace_size);
}
// Runs `phrase2` with a cached executor when one exists.
// Returns false on a cache miss. Otherwise allocates the workspace the executor
// asks for, submits the call through OpCommand::RunOpApiV2, clears the cache
// thread-local state and returns true.
// NOTE(review): the submitted callable captures only the raw workspace address, while
// the workspace tensor lives in this function's scope — confirm this is safe if
// RunOpApiV2 defers the call past this function's return.
bool ExecuteCachedOp(aclrtStream acl_stream, const char* aclnn_api, void* phrase2)
{
    uint64_t workspace_size = 0;
    aclOpExecutor* executor = GetCacheExecutor(&workspace_size);
    if (executor == nullptr) {
        return false;
    }
    at::Tensor workspace_tensor;
    void* workspace_addr = nullptr;
    if (workspace_size != 0) {
        workspace_tensor = at_npu::native::OpPreparation::unsafe_empty_workspace(workspace_size);
        workspace_addr = const_cast<void*>(workspace_tensor.storage().data());
    }
    auto acl_call = [=]()->int {
        const OpApiFunc entry = reinterpret_cast<OpApiFunc>(phrase2);
        auto api_ret = entry(workspace_addr, workspace_size, executor, acl_stream);
        NPU_CHECK_ERROR(api_ret, "call failed");
        return api_ret;
    };
    at_npu::native::OpCommand::RunOpApiV2(aclnn_api, acl_call);
    UnInitCacheThreadLocal();
    return true;
}
// Runs `phrase2` directly with a cached V2 executor when one exists.
// Returns false on a cache miss (leaving *api_ret untouched). Otherwise allocates the
// workspace on `acl_stream`, invokes the op, stores its status in *api_ret, checks it
// with NPU_CHECK_ERROR, clears the cache thread-local state and returns true.
bool ExecuteCachedOpV2(aclrtStream acl_stream, const char* aclnn_api, void* phrase2, int* api_ret)
{
    uint64_t workspace_size = 0;
    aclOpExecutor* executor = GetCacheExecutorV2(&workspace_size);
    if (executor == nullptr) {
        return false;
    }
    at::Tensor workspace_tensor;
    void *workspace_addr = nullptr;
    const bool needs_workspace = (workspace_size != 0);
    if (needs_workspace) {
        workspace_tensor = at_npu::native::OpPreparation::unsafe_empty_workspace(workspace_size, acl_stream);
        workspace_addr = const_cast<void *>(workspace_tensor.storage().data());
    }
    const OpApiFunc entry = reinterpret_cast<OpApiFunc>(phrase2);
    *api_ret = entry(workspace_addr, workspace_size, executor, acl_stream);
    NPU_CHECK_ERROR(*api_ret, "call failed");
    UnInitCacheThreadLocal();
    return true;
}
void RunAclCall(const string &op_name, const PROC_FUNC &func)
{
at_npu::native::OpCommand::RunOpApiV2(op_name, func);
}
// Exported accessor for the task-queue-enable option held by OptionsManager.
TORCH_NPU_API uint32_t OpApiGetTaskQueueEnable()
{
    using c10_npu::option::OptionsManager;
    return OptionsManager::GetTaskQueueEnable();
}