aclnnGroupedMatmulFinalizeRoutingWeightNz
产品支持情况
| 产品 | 是否支持 |
|---|---|
| Ascend 950PR/Ascend 950DT | × |
| Atlas A3 训练系列产品/Atlas A3 推理系列产品 | √ |
| Atlas A2 训练系列产品/Atlas A2 推理系列产品 | √ |
| Atlas 200I/500 A2 推理产品 | × |
| Atlas 推理系列产品 | × |
| Atlas 训练系列产品 | × |
功能说明
GroupedMatmul和MoeFinalizeRouting的融合算子,GroupedMatmul计算后的输出按照索引做combine动作,支持w为AI处理器亲和数据排布格式(NZ)。
函数原型
每个算子分为两段式接口,必须先调用“aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize”接口获取计算所需workspace大小以及包含了算子计算流程的执行器,再调用“aclnnGroupedMatmulFinalizeRoutingWeightNz”接口执行计算。
aclnnStatus aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize(
const aclTensor *x1,
const aclTensor *x2,
const aclTensor *scale,
const aclTensor *bias,
const aclTensor *pertokenScaleOptional,
const aclTensor *groupList,
const aclTensor *sharedInput,
const aclTensor *logit,
const aclTensor *rowIndex,
int64_t dtype,
float sharedInputWeight,
int64_t sharedInputOffset,
bool transposeX1,
bool transposeX2,
int64_t groupListType,
aclTensor *out,
uint64_t *workspaceSize,
aclOpExecutor **executor)
aclnnStatus aclnnGroupedMatmulFinalizeRoutingWeightNz(
void *workspace,
uint64_t workspaceSize,
aclOpExecutor *executor,
aclrtStream stream)
aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize
-
参数说明
参数名 输入/输出 描述 使用说明 数据类型 数据格式 维度(shape) 非连续Tensor x1 输入 输入x(左矩阵)。 - INT8 ND shape支持2维,维度为(m, k), 数据类型支持INT8,维度m的取值范围为[1,16\*1024\*8];k支持256、512、1024、1408、2048。 - x2 输入 输入weight(右矩阵)。 - INT8 NZ shape支持5维。维度为(e, n1, k1, k0, n0),其中k0 = 16,n0 = 32, x1 shape中的k和x2 shape中的k1需要满足以下关系:ceilDiv(k,16) = k1。可使用aclnnCalculateMatmulWeightSizeV2接口以及aclnnTransMatmulWeight接口完成输入Format从ND到AI处理器亲和数据排布格式(NZ)的转换。e取值范围[1,256]。 - scale 输入 量化参数中的缩放因子,per-channel量化参数。 - INT64 ND shape是2维(e, n),n = n1 \* n0,e和w的e一致,n支持2048、7168、7680。 - bias 输入 矩阵的偏移。当前为预留参数,暂不生效,传入空指针即可。 - FLOAT32 ND - pertokenScaleOptional 输入 矩阵计算的反量化参数。 FLOAT32 ND shape支持一维,维度为(m),m和x1的m一致 - groupList 输入 输入和输出分组轴方向的matmul大小分布。 INT64 ND shape支持一维,维度为(e),e和x2的e一致 - sharedInput 输入 moe计算中共享专家的输出,需要与moe专家的输出进行combine操作。 BF16 ND 支持二维,维度为(bsdp,n),bsdp必须小于等于batchSize/e,n和x2的n一致。 - logit 输入 moe专家对各个token的logit大小。 FLOAT32 ND shape支持一维,维度为(m),m和x1的m一致 - rowIndex 输入 moe专家输出按照该rowIndex进行combine,其中的值即为combine做scatter add的索引。 INT32、INT64 ND shape支持一维,维度为(m),m和x1的m一致 - dtype 输入 计算的输出类型:0:FLOAT32;1:FLOAT16;2:BFLOAT16。目前仅支持0。 INT64 - sharedInputWeight 输入 共享专家与moe专家进行combine的系数,sharedInput先与该参数乘,然后在和moe专家结果累加。 FLOAT32 - sharedInputOffset 输入 共享专家输出在总输出中的偏移。 INT64 - transposeX1 输入 左矩阵是否转置,仅支持false。 BOOL - transposeX2 输入 右矩阵是否转置,仅支持false。 BOOL - groupListType 输入 分组模式:配置为0:cumsum模式,即为前缀和;配置为1:count模式。 INT64 - out 输出 输出结果。 shape与self相同。 FLOAT32 ND 0-8 - workspaceSize 输出 返回需要在Device侧申请的workspace大小。 - - - - - executor 输出 返回op执行器,包含了算子计算流程。 - - - - - -
返回值
返回aclnnStatus状态码,具体参见aclnn返回码。
第一段接口完成入参校验,出现以下场景时报错:
返回值 错误码 描述 ACLNN_ERR_PARAM_NULLPTR 161001 传入的x1、x2、scale、bias或out是空指针。 ACLNN_ERR_PARAM_INVALID 161002 x1、x2、scale、bias、pertokenScaleOptional、groupList、sharedInput、logit、rowIndex、sharedInputWeight、sharedInputOffset、transposeX1、transposeX2或out的数据类型和数据格式不在支持的范围内。 x1、x2、scale、bias、pertokenScaleOptional、groupList、sharedInput、logit、rowIndex或out的shape不满足校验条件。 x1、x2、scale、bias、pertokenScaleOptional、groupList、sharedInput、 logit、rowIndex或out是空tensor。 dtype、sharedInputOffset、transposeX1、transposeX2、groupListType的取值范围不满足条件。
aclnnGroupedMatmulFinalizeRoutingWeightNz
-
参数说明
参数名 输入/输出 描述 workspace 输入 在Device侧申请的workspace内存地址。 workspaceSize 输入 在Device侧申请的workspace大小,由第一段接口aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize获取。 executor 输入 op执行器,包含了算子计算流程。 stream 输入 指定执行任务的Stream。 -
返回值
返回aclnnStatus状态码,具体参见aclnn返回码。
约束说明
-
确定性计算:
- aclnnGroupedMatmulFinalizeRoutingWeightNz默认非确定性实现,支持通过aclrtCtxSetSysParamOpt开启确定性。
-
输入和输出支持以下数据类型组合:
x1 x2 scale bias pertokenScaleOptional groupList sharedInput logit rowIndex out INT8 INT8 FLOAT32 null FLOAT32 INT64 BFLOAT16 FLOAT32 INT64 FLOAT INT8 INT8 FLOAT32 null FLOAT32 INT64 null null INT64 FLOAT
调用示例
示例代码如下,仅供参考,具体编译和执行过程请参考编译与运行样例。
#include <iostream>
#include <memory>
#include <vector>
#include "acl/acl.h"
#include "aclnnop/aclnn_permute.h"
#include "aclnnop/aclnn_grouped_matmul_finalize_routing_weight_nz.h"
#include "aclnnop/aclnn_trans_matmul_weight.h"
#define CHECK_RET(cond, return_expr) \
do { \
if (!(cond)) { \
return_expr; \
} \
} while (0)
#define CHECK_FREE_RET(cond, return_expr) \
do { \
if (!(cond)) { \
Finalize(deviceId, stream); \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape)
{
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
int Init(int32_t deviceId, aclrtStream *stream)
{
// 固定写法,资源初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
ret = aclrtCreateStream(stream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据拷贝到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
shape.data(), shape.size(), *deviceAddr);
return 0;
}
template <typename T>
int CreateAclTensorWeight(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = static_cast<uint64_t>(GetShapeSize(shape));
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
auto ret = aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnCalculateMatmulWeightSizeV2 failed. ERROR: %d\n", ret);
return ret);
size *= sizeof(T);
// 调用aclrtMalloc申请device侧内存
ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据拷贝到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
std::vector<int64_t> storageShape;
storageShape.push_back(GetShapeSize(shape));
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
storageShape.data(), storageShape.size(), *deviceAddr);
return 0;
}
int main() {
int32_t deviceId = 0;
aclrtStream stream;
auto ret = Init(deviceId, &stream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("Init stream failed. ERROR: %d\n", ret); return ret);
// 2. 构造输入与输出,需要根据API的接口自定义构造
int64_t m = 192;
int64_t k = 2048;
int64_t n = 7168;
int64_t e = 4;
int64_t batch = 24;
int64_t bsdp = 8;
int64_t dtype = 0;
float shareInputWeight = 1.0;
int64_t sharedInputOffset = 0;
bool transposeX = false;
bool transposeW = false;
int64_t groupListType = 1;
std::vector<int64_t> xShape = {m, k};
std::vector<int64_t> wShape = {e, k, n};
std::vector<int64_t> scaleShape = {e, n};
std::vector<int64_t> pertokenScaleShape = {m};
std::vector<int64_t> groupListShape = {e};
std::vector<int64_t> sharedInputShape = {bsdp, n};
std::vector<int64_t> logitShape = {m};
std::vector<int64_t> rowIndexShape = {m};
std::vector<int64_t> outShape = {batch, n};
void *xDeviceAddr = nullptr;
void *wDeviceAddr = nullptr;
void *scaleDeviceAddr = nullptr;
void *pertokenScaleDeviceAddr = nullptr;
void *groupListDeviceAddr = nullptr;
void *sharedInputDeviceAddr = nullptr;
void *logitDeviceAddr = nullptr;
void *rowIndexDeviceAddr = nullptr;
void *outDeviceAddr = nullptr;
aclTensor* x = nullptr;
aclTensor* w = nullptr;
aclTensor* bias = nullptr;
aclTensor* groupList = nullptr;
aclTensor* scale = nullptr;
aclTensor* pertokenScale = nullptr;
aclTensor* sharedInput = nullptr;
aclTensor* logit = nullptr;
aclTensor* rowIndex = nullptr;
aclTensor* out = nullptr;
std::vector<int8_t> xHostData(GetShapeSize(xShape));
std::vector<int8_t> wHostData(GetShapeSize(wShape));
std::vector<float> scaleHostData(GetShapeSize(scaleShape));
std::vector<float> pertokenScaleHostData(GetShapeSize(pertokenScaleShape));
std::vector<int64_t> groupListHostData(GetShapeSize(groupListShape));
groupListHostData[0] = 7;
groupListHostData[1] = 32;
groupListHostData[2] = 40;
groupListHostData[3] = 64;
std::vector<uint16_t> sharedInputHostData(GetShapeSize(sharedInputShape));
std::vector<int64_t> logitHostData(GetShapeSize(logitShape));
std::vector<float> rowIndexHostData(GetShapeSize(rowIndexShape));
std::vector<float> outHostData(GetShapeSize(outShape));
// 创建x aclTensor
ret = CreateAclTensor(xHostData, xShape, &xDeviceAddr, aclDataType::ACL_INT8, &x);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> xTensorPtr(x, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> xDeviceAddrPtr(xDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建AI处理器亲和数据排布格式的w aclTensor
ret = CreateAclTensorWeight(wHostData, wShape, &wDeviceAddr, aclDataType::ACL_INT8, &w);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> wTensorPtr(w, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> wDeviceAddrPtr(wDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建scale aclTensor
ret = CreateAclTensor(scaleHostData, scaleShape, &scaleDeviceAddr, aclDataType::ACL_FLOAT, &scale);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> scaleTensorPtr(scale, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> scaleDeviceAddrPtr(scaleDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建pertokenScale aclTensor
ret = CreateAclTensor(pertokenScaleHostData, pertokenScaleShape, &pertokenScaleDeviceAddr, aclDataType::ACL_FLOAT, &pertokenScale);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> pertokenScaleTensorPtr(pertokenScale, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> pertokenScaleDeviceAddrPtr(pertokenScaleDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建groupList aclTensor
ret = CreateAclTensor(groupListHostData, groupListShape, &groupListDeviceAddr, aclDataType::ACL_INT64, &groupList);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> groupListTensorPtr(groupList, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> groupListDeviceAddrPtr(groupListDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建sharedInput aclTensor
ret = CreateAclTensor(sharedInputHostData, sharedInputShape, &sharedInputDeviceAddr, aclDataType::ACL_BF16, &sharedInput);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> sharedInputTensorPtr(sharedInput, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> sharedInputDeviceAddrPtr(sharedInputDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建logit aclTensor
ret = CreateAclTensor(logitHostData, logitShape, &logitDeviceAddr, aclDataType::ACL_FLOAT, &logit);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> logitTensorPtr(logit, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> logitDeviceAddrPtr(logitDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建rowIndex aclTensor
ret = CreateAclTensor(rowIndexHostData, rowIndexShape, &rowIndexDeviceAddr, aclDataType::ACL_INT64, &rowIndex);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> rowIndexTensorPtr(rowIndex, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> rowIndexDeviceAddrPtr(rowIndexDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 创建out aclTensor
ret = CreateAclTensor(outHostData, outShape, &outDeviceAddr, aclDataType::ACL_FLOAT, &out);
std::unique_ptr<aclTensor, aclnnStatus (*)(const aclTensor *)> outTensorPtr(out, aclDestroyTensor);
std::unique_ptr<void, aclError (*)(void *)> outDeviceAddrPtr(outDeviceAddr, aclrtFree);
CHECK_RET(ret == ACL_SUCCESS, return ret);
// 3. 调用CANN算子库API,需要修改为具体的Api名称
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
void *workspaceAddr = nullptr;
// 调用aclnnTransMatmulWeight第一段接口
ret = aclnnTransMatmulWeightGetWorkspaceSize(w, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnTransMatmulWeightGetWorkspaceSize failed. ERROR: %d\n", ret);
return ret);
// 根据第一段接口计算出的workspaceSize申请device内存
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
// 调用aclnnTransMatmulWeight第二段接口
ret = aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnTransMatmulWeight failed. ERROR: %d\n", ret); return ret);
// 调用aclnnGroupedMatmulFinalizeRoutingWeightNz第一段接口
workspaceSize = 0;
ret = aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize(x, w, scale, nullptr, pertokenScale, groupList, sharedInput, logit, rowIndex, dtype, shareInputWeight, sharedInputOffset, transposeX, transposeW, groupListType, out, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGroupedMatmulFinalizeRoutingWeightNzGetWorkspaceSize failed. ERROR: %d\n", ret);
return ret);
// 根据第一段接口计算出的workspaceSize申请device内存
if (workspaceSize > 0) {
ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
// 调用aclnnGroupedMatmulFinalizeRoutingWeightNz第二段接口
ret = aclnnGroupedMatmulFinalizeRoutingWeightNz(workspaceAddr, workspaceSize, executor, stream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnGroupedMatmulFinalizeRoutingWeightNz failed. ERROR: %d\n", ret); return ret);
// 4. (固定写法)同步等待任务执行结束
ret = aclrtSynchronizeStream(stream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);
// 5. 获取输出的值,将device侧内存上的结果拷贝至host侧,需要根据具体API的接口定义修改
auto size = GetShapeSize(outShape);
std::vector<float> resultData(size, 0);
ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outDeviceAddr,
size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
return ret);
for (int64_t i = 0; i < size; i++) {
LOG_PRINT("result[%lld] is: %f\n", i, resultData[i]);
}
// 6. 释放aclTensor资源,需要根据具体API的接口定义修改
aclDestroyTensor(x);
aclDestroyTensor(w);
aclDestroyTensor(scale);
aclDestroyTensor(pertokenScale);
aclDestroyTensor(groupList);
aclDestroyTensor(sharedInput);
aclDestroyTensor(logit);
aclDestroyTensor(rowIndex);
aclDestroyTensor(out);
// 7.释放device资源,需要根据具体API的接口定义修改
aclrtFree(xDeviceAddr);
aclrtFree(wDeviceAddr);
aclrtFree(scaleDeviceAddr);
aclrtFree(pertokenScaleDeviceAddr);
aclrtFree(groupListDeviceAddr);
aclrtFree(sharedInputDeviceAddr);
aclrtFree(logitDeviceAddr);
aclrtFree(rowIndexDeviceAddr);
aclrtFree(outDeviceAddr);
if (workspaceSize > 0) {
aclrtFree(workspaceAddr);
}
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}