* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file cube_dyn.h
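* \brief Dynamic-shape cube tile operations: GM/L1/L0 data-movement helpers and the matrix-multiply wrapper.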
*/
#ifndef TILE_FWK_CUBE_DYN_H
#define TILE_FWK_CUBE_DYN_H
#include "tileop_common.h"
#include "cube.h"
#include <type_traits>
namespace TileOp {
// Element count used throughout this file when rounding M/N tile dimensions up with CeilAlign.
constexpr uint16_t BLOCK_CUBE_M_N = 16;
// Byte size of one aligned block; c0Size values in this file are derived as BLOCK_ALIGN_BYTE / sizeof(T).
constexpr uint16_t BLOCK_ALIGN_BYTE = 32;
// Largest value a uint16_t can hold; DynL1CopyInNZ2NZ compares the source row gap against it.
constexpr uint16_t MAX_UINT16 = 65535;
/**
 * Rounds num_1 up to the next multiple of num_2.
 *
 * \param num_1 value to round up
 * \param num_2 alignment unit
 * \return the aligned value converted back to T, or 0 when num_2 is 0
 */
template <typename T>
INLINE T CeilAlign(T num_1, T num_2)
{
    // A zero alignment unit would divide by zero, so it yields 0 instead.
    return (num_2 == 0) ? static_cast<T>(0) : static_cast<T>((num_1 + num_2 - 1) / num_2 * num_2);
}
/**
 * Divides num_1 by num_2, rounding the quotient up.
 *
 * \param num_1 dividend
 * \param num_2 divisor
 * \return the rounded-up quotient converted back to T, or 0 when num_2 is 0
 */
template <typename T>
INLINE T CeilDiv(T num_1, T num_2)
{
    // A zero divisor yields 0 rather than dividing.
    return (num_2 == 0) ? static_cast<T>(0) : static_cast<T>((num_1 + num_2 - 1) / num_2);
}
// Dispatches a GM -> L1 copy on copyInMode (which must be a constant expression): CopyInMode::NZ2NZ expands to
// DynL1CopyInNZ2NZ with gmShape0/gmShape1 passed again as curH/curW; every other mode expands to
// DynL1CopyIn<gmDtype, l1Dtype, mode>. The l1Shape*/l1Stride* arguments are accepted but not used by the expansion.
#define RT_OPERATION_OP_L1_COPY_IN( \
l1Addr, l1Dtype, l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, gmAddr, gmDtype, \
gmShape0, gmShape1, gmOffset0, gmOffset1, copyInMode) \
do { \
constexpr CopyInMode mode = static_cast<CopyInMode>(copyInMode); \
if (mode == CopyInMode::NZ2NZ) { \
DynL1CopyInNZ2NZ<gmDtype, l1Dtype>( \
l1Addr, gmAddr, l1ValidShape0, l1ValidShape1, gmShape0, gmShape1, gmOffset0, gmOffset1, gmShape0, \
gmShape1, 0); \
} else { \
DynL1CopyIn<gmDtype, l1Dtype, mode>( \
l1Addr, gmAddr, l1ValidShape0, l1ValidShape1, gmShape0, gmShape1, gmOffset0, gmOffset1, 0); \
} \
} while (0)
/**
 * Copies a TShape0 x TShape1 tile from GM into L1 using one of the copy_gm_to_cbuf_multi_nd2nz_* intrinsics,
 * chosen at compile time from the GM element type.
 *
 * \param dst       L1 destination
 * \param src       GM tensor base; advanced by CalcLinearOffset(GmShape1, GmOffset0, GmOffset1) before copying
 * \param TShape0   tile rows (passed as nValue); nothing is copied when 0
 * \param TShape1   tile columns (passed as dValue); nothing is copied when 0
 * \param GmShape0  not used by this function
 * \param GmShape1  GM row length (passed as srcDValue)
 * \param GmOffset0 row offset of the tile in GM
 * \param GmOffset1 column offset of the tile in GM
 * \param reserved  not used by this function
 *
 * nValue, dValue and srcDValue are narrowed to uint16_t.
 */
template <typename GMT, typename L1T, CopyInMode mode = CopyInMode::ND2NZ>
TILEOP void DynL1CopyIn(
__cbuf__ L1T* dst, __gm__ GMT* src, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1,
unsigned GmOffset0, unsigned GmOffset1, int reserved)
{
if (TShape0 == 0 || TShape1 == 0) {
return;
}
src += CalcLinearOffset(GmShape1, GmOffset0, GmOffset1);
uint16_t nValue = TShape0;
uint16_t dValue = TShape1;
uint16_t srcDValue = GmShape1;
// Default C0 stride: tile rows rounded up to BLOCK_CUBE_M_N. Overwritten below for ND2ND mode and for int8_t.
uint16_t dstNzC0Stride = CeilAlign<uint16_t>(TShape0, BLOCK_CUBE_M_N);
constexpr uint16_t ndNum = 1;
constexpr uint16_t srcNdMatrixStride = 0;
constexpr uint16_t dstNzNStride = 1;
constexpr uint16_t dstNzMatrixStride = 1;
if constexpr (mode == CopyInMode::ND2ND) {
// ND2ND only accepts the element types listed in the static_assert and forces a C0 stride of 1.
constexpr bool is_valid_type = std::is_same_v<GMT, half> || std::is_same_v<GMT, float> ||
std::is_same_v<GMT, std::int32_t> || std::is_same_v<GMT, std::uint64_t>;
static_assert(is_valid_type, "parameter must be one of: fp16, float (fp32), int32_t, uint64_t");
dstNzC0Stride = 1;
if constexpr (std::is_same<GMT, uint64_t>::value) {
// uint64_t is moved through the b8 intrinsic: pointers are reinterpreted as int8_t and both
// column counts are multiplied by 8; the function returns right after this copy.
copy_gm_to_cbuf_multi_nd2nz_b8(
(__cbuf__ int8_t*)dst, (__gm__ int8_t*)src, 0, ndNum, nValue, dValue * 8, srcNdMatrixStride,
srcDValue * 8, dstNzC0Stride, dstNzNStride, dstNzMatrixStride);
return;
}
}
if constexpr (std::is_same<GMT, int8_t>::value) {
// int8_t rounds the C0 stride up to c0Size rows instead of BLOCK_CUBE_M_N (this also replaces the
// value 1 set by ND2ND mode, although the static_assert above does not admit int8_t in that mode).
constexpr uint16_t c0Size = BLOCK_ALIGN_BYTE / sizeof(GMT);
dstNzC0Stride = CeilAlign<uint16_t>(TShape0, c0Size);
copy_gm_to_cbuf_multi_nd2nz_b8(
(__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, ndNum, nValue, dValue, srcNdMatrixStride, srcDValue, dstNzC0Stride,
dstNzNStride, dstNzMatrixStride);
}
if constexpr (std::is_same<GMT, half>::value || std::is_same<GMT, bfloat16_t>::value) {
copy_gm_to_cbuf_multi_nd2nz_b16(
(__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, ndNum, nValue, dValue, srcNdMatrixStride, srcDValue, dstNzC0Stride,
dstNzNStride, dstNzMatrixStride);
}
if constexpr (std::is_same<GMT, float>::value || std::is_same<GMT, int32_t>::value) {
copy_gm_to_cbuf_multi_nd2nz_b32s(
(__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, ndNum, nValue, dValue, srcNdMatrixStride, srcDValue, dstNzC0Stride,
dstNzNStride, dstNzMatrixStride);
}
}
/**
 * Copies a TShape0 x TShape1 tile from a GM tensor into L1 with copy_gm_to_cbuf, computing the source offset
 * from a batch index and c0Size-wide column groups.
 *
 * \param dst       L1 destination
 * \param src       GM tensor base
 * \param TShape0   tile rows; nothing is copied when 0
 * \param TShape1   tile columns; nothing is copied when 0
 * \param GmShape0  not used by this function
 * \param GmShape1  GM row length used to linearise (GmOffset0, GmOffset1)
 * \param GmOffset0 row offset of the tile
 * \param GmOffset1 column offset of the tile
 * \param curH      rows per batch
 * \param curW      columns per batch
 * \param reserved  not used by this function
 *
 * All offset products are evaluated in 64-bit so large GM tensors cannot wrap at 32 bits, and a zero-sized
 * batch (curH * curW == 0) returns early instead of dividing by zero.
 */
template <typename GMT, typename L1T>
TILEOP void DynL1CopyInNZ2NZ(
    __cbuf__ L1T* dst, __gm__ GMT* src, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1,
    unsigned GmOffset0, unsigned GmOffset1, unsigned curH, unsigned curW, int reserved)
{
    if (TShape0 == 0 || TShape1 == 0) {
        return;
    }
    constexpr int64_t c0Size = BLOCK_ALIGN_BYTE / sizeof(GMT);
    // Widen before multiplying: unsigned * unsigned would otherwise wrap at 32 bits.
    const int64_t batchSize = static_cast<int64_t>(curH) * curW;
    if (batchSize == 0) {
        return;
    }
    const int64_t offsetElem = static_cast<int64_t>(GmOffset0) * GmShape1 + GmOffset1;
    const int64_t batchIndex = offsetElem / batchSize;
    const int64_t srcOffset =
        batchIndex * batchSize + static_cast<int64_t>(GmOffset1) * curH + (GmOffset0 - batchIndex * curH) * c0Size;
    uint16_t nBurst = TShape1 / c0Size;
    uint16_t lenBurst = TShape0 * c0Size * sizeof(GMT) / BLOCK_ALIGN_BYTE;
    uint16_t srcStride = (curH - TShape0) * c0Size * sizeof(GMT) / BLOCK_ALIGN_BYTE;
    uint16_t dstStride = 0;
    if constexpr (std::is_same<GMT, int8_t>::value) {
        // int8_t rows are padded up to a multiple of c0Size in the destination.
        dstStride = CeilAlign<uint16_t>(TShape0, c0Size) - TShape0;
    }
    if (curH - TShape0 > MAX_UINT16) {
        // The source gap does not fit the 16-bit stride field: issue one single-burst copy per column group.
        nBurst = 1;
        srcStride = 0;
        for (int32_t nIdx = 0; nIdx < static_cast<int32_t>(TShape1 / c0Size); ++nIdx) {
            int64_t dstOffsetStep = nIdx * c0Size * TShape0;
            int64_t srcOffsetStep = nIdx * c0Size * curH + srcOffset;
            copy_gm_to_cbuf(
                dst + dstOffsetStep, src + srcOffsetStep, 0, nBurst, lenBurst, srcStride, dstStride, PAD_NONE);
        }
    } else {
        copy_gm_to_cbuf(dst, src + srcOffset, 0, nBurst, lenBurst, srcStride, dstStride, PAD_NONE);
    }
}
// Expands to a DynL1ToL0A call (the expansion already ends with ';'). l1Offset0/l1Offset1 become template
// arguments, so they must be compile-time constants. Only the two addresses, l1Dtype, the valid shapes and the
// offsets are used; the remaining arguments are ignored.
#define RT_OPERATION_OP_L1_TO_L0A( \
l0aAddr, l0aDtype, l0aShape0, l0aShape1, l0aValidShape0, l0aValidShape1, l0aStride0, l0aStride1, l1Addr, l1Dtype, \
l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, l1Offset0, l1Offset1, reluMode, \
scaleValue) \
DynL1ToL0A<l1Dtype, l1Offset0, l1Offset1>( \
l0aAddr, l1Addr, l0aValidShape0, l0aValidShape1, l1ValidShape0, l1ValidShape1);
/**
 * Loads a dstM x dstK tile from L1 (src) into L0A (dst).
 *
 * \tparam Offset0 row offset inside the L1 tile
 * \tparam Offset1 column offset inside the L1 tile
 * \param dstM, dstK valid destination shape; rounded up internally
 * \param srcM, srcK valid source shape; rounded up internally
 *
 * Returns without loading when any of the four sizes is 0. Every rounding goes through CeilAlign<uint16_t>,
 * so the sizes are narrowed to 16 bits.
 */
template <typename T, unsigned Offset0, unsigned Offset1>
TILEOP void DynL1ToL0A(__ca__ T* dst, __cbuf__ T* src, unsigned dstM, unsigned dstK, unsigned srcM, unsigned srcK)
{
if (dstM == 0 || dstK == 0 || srcM == 0 || srcK == 0) {
return;
}
constexpr uint16_t c0Size = BLOCK_ALIGN_BYTE / sizeof(T);
// M is aligned to BLOCK_CUBE_M_N, K to c0Size.
dstM = CeilAlign<uint16_t>(dstM, BLOCK_CUBE_M_N);
dstK = CeilAlign<uint16_t>(dstK, c0Size);
srcM = CeilAlign<uint16_t>(srcM, BLOCK_CUBE_M_N);
srcK = CeilAlign<uint16_t>(srcK, c0Size);
if constexpr (std::is_same<T, float>::value) {
// float: the load is done with img2colv2_cbuf_to_ca after writing srcM | (1 << 16) via set_fmatrix;
// dstK is additionally aligned to BLOCK_CUBE_M_N.
uint64_t config = srcM | (1 << 16);
set_fmatrix(config);
dstK = CeilAlign<uint16_t>(dstK, BLOCK_CUBE_M_N);
img2colv2_cbuf_to_ca(
dst, src, dstK, dstM, Offset1, Offset0, 1, 1, 1, 1, 1, 1, false, false, false, false, srcK);
return;
}
if constexpr (std::is_same<T, int8_t>::value) {
// int8_t aligns M to c0Size as well.
dstM = CeilAlign<uint16_t>(dstM, c0Size);
srcM = CeilAlign<uint16_t>(srcM, c0Size);
}
// repeat is 8-bit: dstK / c0Size is narrowed to uint8_t.
uint8_t repeat = dstK / c0Size;
uint16_t srcStride = srcM / BLOCK_CUBE_M_N;
constexpr uint16_t dstStride = 0;
int32_t dstOffsetStep = BLOCK_CUBE_M_N * dstK;
int32_t srcOffset = Offset0 * c0Size + Offset1 * srcM;
int32_t srcOffsetStep = BLOCK_CUBE_M_N * c0Size;
int32_t dstOffset = 0;
// One load_cbuf_to_ca per group of BLOCK_CUBE_M_N rows of M.
for (int32_t mIdx = 0; mIdx < static_cast<int32_t>(dstM / BLOCK_CUBE_M_N); ++mIdx) {
load_cbuf_to_ca(dst + dstOffset, src + srcOffset, 0, repeat, srcStride, dstStride, 0, 0, inc);
dstOffset += dstOffsetStep;
srcOffset += srcOffsetStep;
}
}
// Expands to a DynL1ToL0At call (the expansion already ends with ';'). l1Offset0/l1Offset1 become template
// arguments, so they must be compile-time constants. Only the two addresses, l1Dtype, the valid shapes and the
// offsets are used; the remaining arguments are ignored.
#define RT_OPERATION_OP_L1_TO_L0_AT( \
l0aAddr, l0aDtype, l0aShape0, l0aShape1, l0aValidShape0, l0aValidShape1, l0aStride0, l0aStride1, l1Addr, l1Dtype, \
l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, l1Offset0, l1Offset1, reluMode, \
scaleValue) \
DynL1ToL0At<l1Dtype, l1Offset0, l1Offset1>( \
l0aAddr, l1Addr, l0aValidShape0, l0aValidShape1, l1ValidShape0, l1ValidShape1);
/**
 * Loads a tile from L1 (src) into L0A (dst) for the case where the source shape is given as (srcK, srcM),
 * i.e. in the opposite order to the destination shape (dstM, dstK).
 *
 * \tparam Offset0 first offset inside the L1 tile
 * \tparam Offset1 second offset inside the L1 tile
 * \param dstM, dstK valid destination shape; rounded up internally
 * \param srcK, srcM valid source shape; rounded up internally
 *
 * Returns without loading when any of the four sizes is 0. Sizes are narrowed to 16 bits by CeilAlign<uint16_t>.
 */
template <typename T, unsigned Offset0, unsigned Offset1>
TILEOP void DynL1ToL0At(__ca__ T* dst, __cbuf__ T* src, unsigned dstM, unsigned dstK, unsigned srcK, unsigned srcM)
{
if (dstM == 0 || dstK == 0 || srcM == 0 || srcK == 0) {
return;
}
constexpr uint16_t c0Size = BLOCK_ALIGN_BYTE / sizeof(T);
// Here M is aligned to c0Size and K to BLOCK_CUBE_M_N (the reverse of DynL1ToL0A).
dstM = CeilAlign<uint16_t>(dstM, c0Size);
dstK = CeilAlign<uint16_t>(dstK, BLOCK_CUBE_M_N);
srcM = CeilAlign<uint16_t>(srcM, c0Size);
srcK = CeilAlign<uint16_t>(srcK, BLOCK_CUBE_M_N);
if constexpr (std::is_same<T, float>::value) {
// float: img2colv2_cbuf_to_ca after writing srcK | (1 << 16) via set_fmatrix.
uint64_t config = srcK | (1 << 16);
set_fmatrix(config);
img2colv2_cbuf_to_ca(dst, src, dstM, dstK, Offset1, Offset0, 1, 1, 1, 1, 1, 1, false, false, true, false, srcM);
return;
}
if constexpr (std::is_same<T, int8_t>::value) {
// int8_t: K is aligned to c0Size too, and one load_cbuf_to_ca_transpose is issued per c0Size rows of M.
dstK = CeilAlign<uint16_t>(dstK, c0Size);
srcK = CeilAlign<uint16_t>(srcK, c0Size);
uint8_t repeat = dstK / c0Size;
uint16_t dstFracStride = dstK / c0Size - 1;
int32_t dstOffset = 0;
int32_t srcOffset = Offset0 * c0Size + Offset1 * srcK;
int32_t dstOffsetStep = dstK * c0Size;
int32_t srcOffsetStep = srcK * c0Size;
for (int32_t mIdx = 0; mIdx < static_cast<int32_t>(dstM / c0Size); ++mIdx) {
load_cbuf_to_ca_transpose(dst + dstOffset, src + srcOffset, 0, repeat, 1, 0, inc, dstFracStride);
dstOffset += dstOffsetStep;
srcOffset += srcOffsetStep;
}
return;
}
if (dstK == srcK) {
// Same aligned K on both sides: the whole tile is moved with a single load.
// NOTE(review): this path reads from src without applying Offset0/Offset1, unlike the else branch —
// confirm the offsets are always 0 when dstK == srcK.
uint8_t repeat = (dstK / BLOCK_CUBE_M_N) * (dstM / c0Size);
load_cbuf_to_ca(dst, src, 0, repeat, 1, 0, 0, 1, inc);
} else {
// Different K: one load per c0Size rows of M, stepping the source by srcK * c0Size elements.
uint8_t repeat = dstK / BLOCK_CUBE_M_N;
int32_t dstOffset = 0;
int32_t dstOffsetStep = BLOCK_CUBE_M_N * dstK;
int32_t srcOffset = Offset0 * c0Size + Offset1 * srcK;
int32_t srcOffsetStep = srcK * c0Size;
for (int32_t mIdx = 0; mIdx < static_cast<int32_t>(dstM / c0Size); ++mIdx) {
load_cbuf_to_ca(dst + dstOffset, src + srcOffset, 0, repeat, 1, 0, 0, 1, inc);
dstOffset += dstOffsetStep;
srcOffset += srcOffsetStep;
}
}
}
// Expands to a DynL1ToL0B call (the expansion already ends with ';'). l1Offset0/l1Offset1 become template
// arguments, so they must be compile-time constants. Only the two addresses, l1Dtype, the valid shapes and the
// offsets are used; the remaining arguments are ignored.
#define RT_OPERATION_OP_L1_TO_L0_B( \
l0bAddr, l0bDtype, l0bShape0, l0bShape1, l0bValidShape0, l0bValidShape1, l0bStride0, l0bStride1, l1Addr, l1Dtype, \
l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, l1Offset0, l1Offset1, reluMode, \
scaleValue) \
DynL1ToL0B<l1Dtype, l1Offset0, l1Offset1>( \
l0bAddr, l1Addr, l0bValidShape0, l0bValidShape1, l1ValidShape0, l1ValidShape1);
/**
 * Loads a dstK x dstN tile from L1 (src) into L0B (dst).
 *
 * \tparam Offset0 first offset inside the L1 tile
 * \tparam Offset1 second offset inside the L1 tile
 * \param dstK, dstN valid destination shape; rounded up internally
 * \param srcK, srcN valid source shape; rounded up internally
 *
 * Returns without loading when any of the four sizes is 0. Sizes are narrowed to 16 bits by CeilAlign<uint16_t>.
 */
template <typename T, unsigned Offset0, unsigned Offset1>
TILEOP void DynL1ToL0B(__cb__ T* dst, __cbuf__ T* src, unsigned dstK, unsigned dstN, unsigned srcK, unsigned srcN)
{
if (dstK == 0 || dstN == 0 || srcK == 0 || srcN == 0) {
return;
}
if constexpr (std::is_same<T, float>::value) {
// float: img2colv2_cbuf_to_cb after writing srcK | (1 << 16) via set_fmatrix_b.
dstK = CeilAlign<uint16_t>(dstK, BLOCK_CUBE_M_N);
dstN = CeilAlign<uint16_t>(dstN, BLOCK_CUBE_M_N);
srcN = CeilAlign<uint16_t>(srcN, BLOCK_ALIGN_BYTE / sizeof(T));
srcK = CeilAlign<uint16_t>(srcK, BLOCK_CUBE_M_N);
uint64_t config = srcK | (1 << 16);
set_fmatrix_b(config);
img2colv2_cbuf_to_cb(dst, src, dstN, dstK, Offset1, Offset0, 1, 1, 1, 1, 1, 1, false, false, false, true, srcN);
return;
}
constexpr uint16_t c0Size = BLOCK_ALIGN_BYTE / sizeof(T);
// Non-float types align all four sizes to c0Size.
dstK = CeilAlign<uint16_t>(dstK, c0Size);
dstN = CeilAlign<uint16_t>(dstN, c0Size);
srcN = CeilAlign<uint16_t>(srcN, c0Size);
srcK = CeilAlign<uint16_t>(srcK, c0Size);
if constexpr (std::is_same<T, int8_t>::value) {
// int8_t: one load_cbuf_to_cb_transpose per BLOCK_ALIGN_BYTE columns of N.
uint8_t repeat = dstK / BLOCK_ALIGN_BYTE;
constexpr uint16_t srcStride = 1;
uint16_t dstStride = dstN / BLOCK_CUBE_M_N - 1;
constexpr uint16_t dstFracStride = 0;
int32_t dstOffset = 0;
int32_t srcOffset = Offset0 * BLOCK_ALIGN_BYTE + Offset1 * srcK;
int32_t dstOffsetStep = BLOCK_ALIGN_BYTE * BLOCK_ALIGN_BYTE;
int32_t srcOffsetStep = srcK * BLOCK_ALIGN_BYTE;
for (int32_t nIdx = 0; nIdx < static_cast<int32_t>(dstN / BLOCK_ALIGN_BYTE); ++nIdx) {
load_cbuf_to_cb_transpose(
dst + dstOffset, src + srcOffset, 0, repeat, srcStride, dstStride, inc, dstFracStride);
dstOffset += dstOffsetStep;
srcOffset += srcOffsetStep;
}
return;
}
// Remaining types: one load_cbuf_to_cb per c0Size rows of K. repeat is narrowed to uint8_t.
uint8_t repeat = dstN / BLOCK_CUBE_M_N;
uint16_t srcStride = srcK / c0Size;
constexpr uint16_t dstStride = 0;
for (int32_t kIdx = 0; kIdx < static_cast<int32_t>(dstK / c0Size); ++kIdx) {
load_cbuf_to_cb(
dst + kIdx * c0Size * dstN,
src + kIdx * BLOCK_CUBE_M_N * c0Size + (Offset0 * BLOCK_CUBE_M_N + Offset1 * srcK), 0, repeat, srcStride,
dstStride, 0, 1, inc);
}
}
// Expands to a DynL1ToL0Bt call (the expansion already ends with ';'). l1Offset0/l1Offset1 become template
// arguments, so they must be compile-time constants. Only the two addresses, l1Dtype, the valid shapes and the
// offsets are used; the remaining arguments are ignored.
#define RT_OPERATION_OP_L1_TO_L0_BT( \
l0bAddr, l0bDtype, l0bShape0, l0bShape1, l0bValidShape0, l0bValidShape1, l0bStride0, l0bStride1, l1Addr, l1Dtype, \
l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, l1Offset0, l1Offset1, reluMode, \
scaleValue) \
DynL1ToL0Bt<l1Dtype, l1Offset0, l1Offset1>( \
l0bAddr, l1Addr, l0bValidShape0, l0bValidShape1, l1ValidShape0, l1ValidShape1);
/**
 * Loads a tile from L1 (src) into L0B (dst) for the case where the source shape is given as (srcN, srcK),
 * i.e. in the opposite order to the destination shape (dstK, dstN).
 *
 * \tparam Offset0 first offset inside the L1 tile
 * \tparam Offset1 second offset inside the L1 tile
 * \param dstK, dstN valid destination shape; rounded up internally
 * \param srcN, srcK valid source shape; rounded up internally
 *
 * Returns without loading when any of the four sizes is 0. Sizes are narrowed to 16 bits by CeilAlign<uint16_t>.
 */
template <typename T, unsigned Offset0, unsigned Offset1>
TILEOP void DynL1ToL0Bt(__cb__ T* dst, __cbuf__ T* src, unsigned dstK, unsigned dstN, unsigned srcN, unsigned srcK)
{
if (dstK == 0 || dstN == 0 || srcK == 0 || srcN == 0) {
return;
}
constexpr uint16_t c0Size = BLOCK_ALIGN_BYTE / sizeof(T);
// N is aligned to BLOCK_CUBE_M_N, K to c0Size.
dstN = CeilAlign<uint16_t>(dstN, BLOCK_CUBE_M_N);
dstK = CeilAlign<uint16_t>(dstK, c0Size);
srcN = CeilAlign<uint16_t>(srcN, BLOCK_CUBE_M_N);
srcK = CeilAlign<uint16_t>(srcK, c0Size);
if constexpr (std::is_same<T, int8_t>::value) {
// int8_t aligns N to c0Size as well.
dstN = CeilAlign<uint16_t>(dstN, c0Size);
srcN = CeilAlign<uint16_t>(srcN, c0Size);
}
if (dstN == srcN) {
// Same aligned N on both sides: a single load covers the whole tile. repeat is narrowed to uint8_t.
uint8_t repeat = dstN / BLOCK_CUBE_M_N * dstK / c0Size;
constexpr uint16_t srcStride = 1;
constexpr uint16_t dstStride = 0;
load_cbuf_to_cb(
dst, src + (Offset0 * c0Size + Offset1 * srcN), (uint16_t)0, repeat, srcStride, dstStride, (uint8_t)0,
(bool)0, (addr_cal_mode_t)0);
} else {
// Different N: one load per c0Size rows of K, stepping the source by srcN * c0Size elements.
uint8_t repeat = dstN / BLOCK_CUBE_M_N;
constexpr uint16_t srcStride = 1;
constexpr uint16_t dstStride = 0;
for (int32_t kIdx = 0; kIdx < static_cast<int32_t>(dstK / c0Size); ++kIdx) {
load_cbuf_to_cb(
dst + kIdx * c0Size * dstN, src + kIdx * srcN * c0Size + (Offset0 * c0Size + Offset1 * srcN),
(uint16_t)0, repeat, srcStride, dstStride, (uint8_t)0, (bool)0, (addr_cal_mode_t)0);
}
}
}
// Expands to a DynL1ToBT call (the expansion already ends with ';'). Only biasTableAddr, biasTableDtype,
// biasTableValidShape1, l1Addr, l1Dtype and l1Offset1 are used; l1Offset1 becomes a template argument and must be
// a compile-time constant.
#define RT_OPERATION_OP_L1_TO_BT( \
biasTableAddr, biasTableDtype, biasTableShape0, biasTableShape1, biasTableValidShape0, biasTableValidShape1, \
biasTableStride0, biasTableStride1, l1Addr, l1Dtype, l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, \
l1Stride1, l1Offset0, l1Offset1) \
DynL1ToBT<l1Dtype, biasTableDtype, l1Offset1>(biasTableAddr, l1Addr, biasTableValidShape1);
/**
 * Copies nSize elements, starting Offset elements into src, from L1 to the address dst with copy_cbuf_to_bt.
 *
 * \tparam L1T    source element type; the conversion flag passed to the intrinsic is true only for half
 * \tparam BTT    destination element type (not used in the body)
 * \tparam Offset element offset applied to src
 * \param dst     destination address
 * \param src     L1 source
 * \param nSize   number of elements to copy
 */
template <typename L1T, typename BTT, unsigned Offset>
TILEOP void DynL1ToBT(uint64_t dst, __cbuf__ L1T* src, unsigned nSize)
{
    constexpr bool isHalfSource = std::is_same_v<L1T, half>;
    constexpr uint16_t kBurstCount = 1;
    constexpr uint16_t kSrcGap = 0;
    constexpr uint16_t kDstGap = 0;
    // Burst length is the byte count divided by 64, rounded up (both operands narrowed to uint16_t).
    const uint16_t burstLen = CeilDiv<uint16_t>(nSize * sizeof(L1T), 64);
    copy_cbuf_to_bt(dst, src + Offset, isHalfSource, kBurstCount, burstLen, kSrcGap, kDstGap);
}
// Expands to a DynL1ToFB call (the expansion already ends with ';'). Only fixPipeAddr, fixPipeValidShape1, l1Addr,
// l1Dtype and l1Offset1 are used; l1Offset1 becomes a template argument and must be a compile-time constant.
#define RT_OPERATION_OP_L1_TO_FIX_QUANT_PRE( \
fixPipeAddr, fixPipeDtype, fixPipeShape0, fixPipeShape1, fixPipeValidShape0, fixPipeValidShape1, fixPipeStride0, \
fixPipeStride1, l1Addr, l1Dtype, l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, \
l1Offset0, l1Offset1) \
DynL1ToFB<l1Dtype, l1Offset1>(fixPipeAddr, l1Addr, fixPipeValidShape1);
/**
 * Copies nSize entries from L1 (src + L1Offset) into the __fbuf__ buffer dst, then calls set_fpc with a value
 * derived from the dst address.
 *
 * \tparam L1Offset element offset applied to src
 * \param dst   destination buffer
 * \param src   L1 source
 * \param nSize entry count; each entry is sized as sizeof(uint64_t) bytes regardless of T
 */
template <typename T, unsigned L1Offset>
TILEOP void DynL1ToFB(__fbuf__ T* dst, __cbuf__ T* src, unsigned nSize)
{
    // Payload size rounded up to whole 128-byte units (kept in 16 bits), then expressed as a unit count.
    const uint16_t paddedBytes = CeilDiv<uint16_t>(nSize * sizeof(uint64_t), 128) * 128;
    const uint16_t burstLen = paddedBytes / 128;
    copy_cbuf_to_fbuf(dst, src + L1Offset, 1, burstLen, 0, 0);
    // The destination address is shifted right by 7 and then left by 8 before being handed to set_fpc.
    const uint64_t fpcValue = ((uint64_t)dst >> static_cast<uint64_t>(7)) << 8;
    set_fpc(fpcValue);
}
// Expands to DynTmad<l0cDtype, l0aDtype, l0bDtype, 0, 0, hasbias> with m = l0aValidShape0, k = l0aValidShape1,
// n = l0bValidShape1, zero_C = false and uf = 0 (the expansion already ends with ';').
// NOTE(review): this macro passes zero_C = false while RT_OPERATION_OP_A_MULACC_B passes true — confirm the
// polarity matches what the mad intrinsic expects.
#define RT_OPERATION_OP_A_MUL_B( \
l0cAddr, l0cDtype, l0cShape0, l0cShape1, l0cValidShape0, l0cValidShape1, l0cStride0, l0cStride1, l0aAddr, \
l0aDtype, l0aShape0, l0aShape1, l0aValidShape0, l0aValidShape1, l0aStride0, l0aStride1, l0bAddr, l0bDtype, \
l0bShape0, l0bShape1, l0bValidShape0, l0bValidShape1, l0bStride0, l0bStride1, hasbias) \
DynTmad<l0cDtype, l0aDtype, l0bDtype, 0, 0, hasbias>( \
l0cAddr, l0aAddr, l0bAddr, l0aValidShape0, l0aValidShape1, l0bValidShape1, false, 0, l0cValidShape0, \
l0cValidShape1);
// Expands to DynTmad<l0cDtype, l0aDtype, l0bDtype, 0, 0, hasbias> with m = l0aValidShape0, k = l0aValidShape1,
// n = l0bValidShape1, zero_C = true and uf = 0 (the expansion already ends with ';').
// NOTE(review): this macro passes zero_C = true while RT_OPERATION_OP_A_MUL_B passes false — confirm the
// polarity matches what the mad intrinsic expects.
#define RT_OPERATION_OP_A_MULACC_B( \
l0cAddr, l0cDtype, l0cShape0, l0cShape1, l0cValidShape0, l0cValidShape1, l0cStride0, l0cStride1, l0aAddr, \
l0aDtype, l0aShape0, l0aShape1, l0aValidShape0, l0aValidShape1, l0aStride0, l0aStride1, l0bAddr, l0bDtype, \
l0bShape0, l0bShape1, l0bValidShape0, l0bValidShape1, l0bStride0, l0bStride1, hasbias) \
DynTmad<l0cDtype, l0aDtype, l0bDtype, 0, 0, hasbias>( \
l0cAddr, l0aAddr, l0bAddr, l0aValidShape0, l0aValidShape1, l0bValidShape1, true, 0, l0cValidShape0, \
l0cValidShape1);
/**
 * Issues one mad instruction on the L0A/L0B operands and writes into c, followed by pipe_barrier(PIPE_M).
 *
 * \tparam Offset0, Offset1 offsets applied to c: Offset0 * BLOCK_CUBE_M_N + Offset1 * L0CShape0 elements
 * \tparam HasBias          forwarded to mad; when true the zero_C flag is forced to false
 * \param c, a, b    output and the two input operands
 * \param m, k, n    problem sizes; nothing is issued when any of them is 0
 * \param zero_C     flag forwarded as the last mad argument (unless HasBias overrides it)
 * \param uf         not used by this function (mad always receives 0 in that position)
 * \param L0CShape0  used only in the output offset
 * \param L0CShape1  not used by this function
 */
template <typename Tc, typename Ta, typename Tb, unsigned Offset0, unsigned Offset1, bool HasBias = false>
TILEOP void DynTmad(
    __cc__ Tc* c, __ca__ Ta* a, __cb__ Tb* b, uint16_t m, uint16_t k, uint16_t n, bool zero_C, int uf,
    unsigned L0CShape0, unsigned L0CShape1)
{
    const bool emptyProblem = (m == 0) || (k == 0) || (n == 0);
    if (emptyProblem) {
        return;
    }
    const bool lastFlag = HasBias ? false : zero_C;
    // m is always rounded up to BLOCK_CUBE_M_N; n is rounded up to 32 only for int8_t B operands.
    const uint16_t alignedM = CeilAlign<uint16_t>(m, BLOCK_CUBE_M_N);
    uint16_t alignedN = n;
    if constexpr (std::is_same<Tb, int8_t>::value) {
        alignedN = CeilAlign<uint16_t>(n, 32);
    }
    constexpr bool kDirectionAlign = true;
    mad((__cc__ Tc*)(c + (Offset0 * BLOCK_CUBE_M_N) + Offset1 * L0CShape0), a, b, alignedM, k, alignedN, 0,
        kDirectionAlign, HasBias, lastFlag);
    pipe_barrier(PIPE_M);
}
// Copies an L0C tile out to GM through DynL0CCopyOut. When enableGmAcc is true the isAcc = 1 overload is used
// (scaleValue is not forwarded on that path); otherwise the call passes isAcc = 0 plus a trailing scaleValue.
// gmShape0/gmShape1 are passed again as curH/curW. l0cShape*/l0cStride* are not used by the expansion.
// NOTE(review): the non-accumulate call supplies five template arguments and twelve call arguments; no
// DynL0CCopyOut overload visible in this file accepts that combination — confirm which overload is intended.
#define RT_OPERATION_OP_L0C_COPY_OUT( \
gmAddr, gmDtype, gmShape0, gmShape1, l0cAddr, l0cDtype, l0cShape0, l0cShape1, l0cValidShape0, l0cValidShape1, \
l0cStride0, l0cStride1, gmOffset0, gmOffset1, reluMode, enableGmAcc, scaleValue) \
do { \
if (enableGmAcc) { \
DynL0CCopyOut<gmDtype, l0cDtype, 1, true, reluMode>( \
gmAddr, l0cAddr, l0cValidShape0, l0cValidShape1, gmShape0, gmShape1, gmOffset0, gmOffset1, gmShape0, \
gmShape1, 0); \
} else { \
DynL0CCopyOut<gmDtype, l0cDtype, 0, true, reluMode>( \
gmAddr, l0cAddr, l0cValidShape0, l0cValidShape1, gmShape0, gmShape1, gmOffset0, gmOffset1, gmShape0, \
gmShape1, 0, scaleValue); \
} \
} while (0)
/**
 * Copies an oriTShape0 x oriTShape1 tile from L0C (src) to GM (dst) with copy_matrix_cc_to_gm.
 *
 * \tparam enableNZ2ND forwarded as the last intrinsic argument; when false, the GM offset, column count and
 *                     destination stride are recomputed from curH/curW and a c0Size column group
 * \tparam reluMode    forwarded to the intrinsic
 * \param dst        GM tensor base
 * \param src        L0C source
 * \param oriTShape0 tile rows; nothing is copied when 0
 * \param oriTShape1 tile columns; nothing is copied when 0
 * \param GmShape0   not used by this function
 * \param GmShape1   GM row length (destination stride when enableNZ2ND is true)
 * \param GmOffset0  row offset of the tile in GM
 * \param GmOffset1  column offset of the tile in GM
 * \param curH, curW batch height/width, used only when enableNZ2ND is false
 * \param uf         unit-flag mode, narrowed to uint8_t
 * \param scaleValue used only for int32_t -> half output: 0 selects VDEQF16, any other value is written via
 *                   set_quant_pre and selects DEQF16
 *
 * GM offsets are evaluated in 64-bit so that GmOffset0 * GmShape1 cannot wrap at 32 bits.
 */
template <typename GMT, typename L0CT, bool enableNZ2ND, uint8_t reluMode>
TILEOP void DynL0CCopyOut(
    __gm__ GMT* dst, __cc__ L0CT* src, unsigned oriTShape0, unsigned oriTShape1, unsigned GmShape0, unsigned GmShape1,
    unsigned GmOffset0, unsigned GmOffset1, unsigned curH, unsigned curW, int uf, uint64_t scaleValue = 0)
{
    if (oriTShape0 == 0 || oriTShape1 == 0) {
        return;
    }
    uint16_t mSize = oriTShape0;
    uint16_t nSize = oriTShape1;
    uint32_t dstStrideDstD = GmShape1;
    uint16_t srcStride = CeilAlign<uint16_t>(oriTShape0, BLOCK_CUBE_M_N);
    bool channelSplit = false;
    // Widen before multiplying: unsigned * unsigned would otherwise wrap at 32 bits.
    int64_t gmOffset = static_cast<int64_t>(GmOffset0) * GmShape1 + GmOffset1;
    if constexpr (!enableNZ2ND) {
        const int64_t c0Size = std::is_same<GMT, int32_t>::value ? BLOCK_CUBE_M_N : BLOCK_ALIGN_BYTE / sizeof(GMT);
        nSize = CeilAlign<uint16_t>(nSize, c0Size);
        const int64_t wAlign = CeilAlign<int64_t>(curW, c0Size);
        const int64_t elemPerBatch = curH * wAlign;
        const int64_t batchIdx = gmOffset / elemPerBatch;
        gmOffset = batchIdx * elemPerBatch + static_cast<int64_t>(GmOffset1) * curH +
                   (GmOffset0 - batchIdx * curH) * c0Size;
        channelSplit = std::is_same<GMT, float>::value;
        dstStrideDstD = std::is_same<GMT, int32_t>::value ? curH * 2 : curH;
    }
    constexpr uint64_t ndNum = 1;
    constexpr uint64_t srcNdStride = 0;
    constexpr uint64_t dstNdStride = 0;
    constexpr uint64_t ndPara = (ndNum & 0xffff) | ((srcNdStride & 0xffff) << 16) | ((dstNdStride & 0xffff) << 32);
    set_nd_para(ndPara);
    // Pick the quantisation mode from the (L0C type, GM type) pair; everything not listed stays NoQuant.
    uint64_t quantPre = NoQuant;
    if constexpr (std::is_same<L0CT, float>::value) {
        if constexpr (std::is_same<GMT, half>::value) {
            quantPre = QuantMode_t::F322F16;
        } else if constexpr (std::is_same<GMT, bfloat16_t>::value) {
            quantPre = QuantMode_t::F322BF16;
        } else {
            quantPre = QuantMode_t::NoQuant;
        }
    } else if constexpr (std::is_same<L0CT, int32_t>::value && std::is_same<GMT, half>::value) {
        if (scaleValue == 0) {
            quantPre = QuantMode_t::VDEQF16;
        } else {
            set_quant_pre(scaleValue);
            quantPre = QuantMode_t::DEQF16;
        }
    }
    uint8_t unitFlagMode = uf;
    copy_matrix_cc_to_gm(
        (__gm__ GMT*)(dst + gmOffset), (__cc__ L0CT*)src, 0, nSize, mSize, dstStrideDstD, srcStride, unitFlagMode,
        quantPre, reluMode, channelSplit, enableNZ2ND);
}
/**
 * GM-accumulating variant of DynL0CCopyOut: enables atomic addition for GMT, performs the copy with
 * reluMode 0, and switches atomics off again when isAcc == 1.
 *
 * \tparam isAcc    atomics are reset with set_atomic_none() only when this is 1
 * \tparam reluMode must be 0 (enforced by static_assert)
 *
 * All function parameters are forwarded unchanged to the non-accumulating overload.
 * NOTE(review): for isAcc != 1 atomic addition is left enabled on return — confirm that is intended.
 */
template <typename GMT, typename L0CT, int isAcc, bool enableNZ2ND, uint8_t reluMode>
TILEOP void DynL0CCopyOut(
    __gm__ GMT* dst, __cc__ L0CT* src, unsigned oriTShape0, unsigned oriTShape1, unsigned GmShape0, unsigned GmShape1,
    unsigned GmOffset0, unsigned GmOffset1, unsigned curH, unsigned curW, int uf)
{
    static_assert(reluMode == 0, "Relu operation is not supported in GM accumulate mode");
    constexpr bool resetAtomicAfterCopy = (isAcc == 1);
    SetAtomicAddition<GMT>();
    DynL0CCopyOut<GMT, L0CT, enableNZ2ND, 0>(
        dst, src, oriTShape0, oriTShape1, GmShape0, GmShape1, GmOffset0, GmOffset1, curH, curW, uf);
    if constexpr (resetAtomicAfterCopy) {
        set_atomic_none();
    }
}
// Expands to a DynL0CToL1<l1Dtype, l0cDtype, reluMode> call (the expansion already ends with ';').
// l1Shape*/l1Stride*/l0cShape*/l0cStride* are not used by the expansion.
// NOTE(review): nine arguments are passed here, while the DynL0CToL1 visible in this file takes twelve required
// parameters plus scaleValue — confirm the intended overload/signature.
#define RT_OPERATION_OP_L0C_TO_L1( \
l1Addr, l1Dtype, l1Shape0, l1Shape1, l1ValidShape0, l1ValidShape1, l1Stride0, l1Stride1, l0cAddr, l0cDtype, \
l0cShape0, l0cShape1, l0cValidShape0, l0cValidShape1, l0cStride0, l0cStride1, offset0, offset1, reluMode, \
scaleValue) \
DynL0CToL1<l1Dtype, l0cDtype, reluMode>( \
l1Addr, l0cAddr, l0cValidShape0, l0cValidShape1, l1ValidShape0, l1ValidShape1, offset0, offset1, scaleValue);
/**
 * Copies a shape0 x shape1 tile from L0C (src) to L1 (dst) with copy_matrix_cc_to_cbuf.
 *
 * \tparam reluMode forwarded to the intrinsic
 * \param dst, src              L1 destination and L0C source
 * \param shape0, shape1        tile size; rounded up to BLOCK_CUBE_M_N and c0Size respectively
 * \param l1Shape0              L1 tile height; used for the destination stride and the L1 offset
 * \param l1Shape1              not used by this function
 * \param l1Offset0, l1Offset1  offsets into the L1 tile
 * \param l0cShape0             L0C tile height; used for the source stride and the L0C offset
 * \param l0cShape1             not used by this function
 * \param l0cOffset0, l0cOffset1 offsets into the L0C tile
 * \param scaleValue            used only for int32_t -> half: 0 selects VDEQF16, any other value is written
 *                              via set_quant_pre and selects DEQF16
 *
 * c0Size is derived from the L1 element type and is used for both the L1 and the L0C offset.
 */
template <typename L1T, typename L0CT, uint8_t reluMode>
TILEOP void DynL0CToL1(
    __cbuf__ L1T* dst, __cc__ L0CT* src, unsigned shape0, unsigned shape1, unsigned l1Shape0, unsigned l1Shape1,
    unsigned l1Offset0, unsigned l1Offset1, unsigned l0cShape0, unsigned l0cShape1, unsigned l0cOffset0,
    unsigned l0cOffset1, uint64_t scaleValue = 0)
{
    constexpr int64_t c0Size = BLOCK_ALIGN_BYTE / sizeof(L1T);
    uint16_t mSize = CeilAlign<uint16_t>(shape0, BLOCK_CUBE_M_N);
    uint16_t nSize = CeilAlign<uint16_t>(shape1, c0Size);
    uint32_t dstStrideDstD = CeilAlign<uint16_t>(l1Shape0, BLOCK_CUBE_M_N);
    uint16_t srcStride = CeilAlign<uint16_t>(l0cShape0, BLOCK_CUBE_M_N);
    uint8_t unitFlagMode = 0;
    // Pick the quantisation mode from the (L0C type, L1 type) pair; everything not listed stays NoQuant.
    uint64_t quantPre = NoQuant;
    if constexpr (std::is_same<L0CT, float>::value) {
        if constexpr (std::is_same<L1T, half>::value) {
            quantPre = QuantMode_t::F322F16;
        } else if constexpr (std::is_same<L1T, bfloat16_t>::value) {
            quantPre = QuantMode_t::F322BF16;
        } else {
            quantPre = QuantMode_t::NoQuant;
        }
    } else if constexpr (std::is_same<L0CT, int32_t>::value && std::is_same<L1T, half>::value) {
        if (scaleValue == 0) {
            quantPre = QuantMode_t::VDEQF16;
        } else {
            set_quant_pre(scaleValue);
            quantPre = QuantMode_t::DEQF16;
        }
    }
    bool channelSplit = std::is_same<L1T, float>::value;
    bool nZ2NDEN = false;
    // Offsets are evaluated in 64-bit so the products cannot wrap at 32 bits.
    int64_t l1Offset = static_cast<int64_t>(l1Offset1) * l1Shape0 + l1Offset0 * c0Size;
    int64_t l0cOffset = static_cast<int64_t>(l0cOffset1) * l0cShape0 + l0cOffset0 * c0Size;
    copy_matrix_cc_to_cbuf(
        (__cbuf__ L1T*)(dst + l1Offset), (__cc__ L0CT*)(src + l0cOffset), 0, nSize, mSize, dstStrideDstD, srcStride,
        unitFlagMode, quantPre, reluMode, channelSplit, nZ2NDEN);
}
/**
 * Gathers dstOriginShape0 rows from GM into L1, one row per entry of the offsets array.
 *
 * For each row i the entry offsets[i + offsetsStartOffset] is read, mapped through
 * CalaOffset2PageAttention<uint64_t, T3, blockSize>(blockTable, ...) (helper defined elsewhere), and the row
 * starting at src + mappedOffset * srcRawShape1 + srcColumnStartOffset is copied to dst + i * c0Size.
 * The whole routine is bracketed by pipe_barrier(PIPE_ALL).
 *
 * \tparam T  element type of src/dst
 * \tparam T2 element type of offsets; must be int32_t or int64_t
 * \tparam T3 element type of blockTable
 * \tparam dstRawShape0         not used in the body
 * \tparam offsetRawShape1      row length of the offsets array
 * \tparam srcColumnStartOffset column offset applied to every gathered row
 * \tparam blockSize            forwarded to CalaOffset2PageAttention
 * \param dstOriginShape0, dstOriginShape1 number of rows gathered and elements per row
 * \param srcRawShape1          row length of src
 * \param offsetsRowStartOffset, offsetsColumnStartOffset start position inside the offsets array
 * \param GMBlockTableStride1, GMBlockTableOffset0, GMBlockTableOffset1 used to advance blockTable before use
 */
template <
typename T, typename T2, typename T3, int64_t dstRawShape0, int64_t offsetRawShape1, int64_t srcColumnStartOffset,
int64_t blockSize>
TILEOP void GatherInL1(
__cbuf__ T* dst, int64_t dstOriginShape0, int64_t dstOriginShape1, __gm__ T* src, int64_t srcRawShape1,
__gm__ T2* offsets, __gm__ T3* blockTable, int64_t offsetsRowStartOffset, int64_t offsetsColumnStartOffset,
int64_t GMBlockTableStride1, int64_t GMBlockTableOffset0, int64_t GMBlockTableOffset1)
{
static_assert(std::is_same_v<T2, int32_t> || std::is_same_v<T2, int64_t>);
constexpr uint16_t c0Size = BLOCK_SIZE / sizeof(T);
uint16_t nBurst = dstOriginShape1 / c0Size;
uint16_t dstStride = CeilAlign<uint16_t>(dstOriginShape0, BLOCK_CUBE_M_N) - 1;
int64_t offsetsStartOffset = offsetsRowStartOffset * offsetRawShape1 + offsetsColumnStartOffset;
blockTable += GMBlockTableOffset0 * GMBlockTableStride1 + GMBlockTableOffset1;
pipe_barrier(PIPE_ALL);
if (dstOriginShape1 % c0Size > 0) {
// Row length is not a multiple of c0Size: copy each row with the nd2nz intrinsic matching T
// (int8_t, half/bfloat16_t or float; other element types copy nothing on this path).
uint16_t dValue = dstOriginShape1;
uint16_t srcDValue = srcRawShape1;
uint16_t dstNzC0Stride = CeilAlign<uint16_t>(dstOriginShape0, BLOCK_CUBE_M_N);
if constexpr (std::is_same<T, int8_t>::value) {
// int8_t rounds the C0 stride up to c0Size rows instead of BLOCK_CUBE_M_N.
dstNzC0Stride = CeilAlign<uint16_t>(dstOriginShape0, c0Size);
for (int64_t i = 0; i < dstOriginShape0; i++) {
uint64_t gatherOffset = offsets[i + offsetsStartOffset];
gatherOffset = CalaOffset2PageAttention<uint64_t, T3, blockSize>(blockTable, gatherOffset);
copy_gm_to_cbuf_multi_nd2nz_b8(
(__cbuf__ T*)dst + i * c0Size, (__gm__ T*)src + gatherOffset * srcRawShape1 + srcColumnStartOffset,
0, 1, 1, dValue, 0, srcDValue, dstNzC0Stride, 1, 1);
}
}
if constexpr (std::is_same<T, half>::value || std::is_same<T, bfloat16_t>::value) {
for (int64_t i = 0; i < dstOriginShape0; i++) {
uint64_t gatherOffset = offsets[i + offsetsStartOffset];
gatherOffset = CalaOffset2PageAttention<uint64_t, T3, blockSize>(blockTable, gatherOffset);
copy_gm_to_cbuf_multi_nd2nz_b16(
(__cbuf__ T*)dst + i * c0Size, (__gm__ T*)src + gatherOffset * srcRawShape1 + srcColumnStartOffset,
0, 1, 1, dValue, 0, srcDValue, dstNzC0Stride, 1, 1);
}
}
if constexpr (std::is_same<T, float>::value) {
for (int64_t i = 0; i < dstOriginShape0; i++) {
uint64_t gatherOffset = offsets[i + offsetsStartOffset];
gatherOffset = CalaOffset2PageAttention<uint64_t, T3, blockSize>(blockTable, gatherOffset);
copy_gm_to_cbuf_multi_nd2nz_b32s(
(__cbuf__ T*)dst + i * c0Size, (__gm__ T*)src + gatherOffset * srcRawShape1 + srcColumnStartOffset,
0, 1, 1, dValue, 0, srcDValue, dstNzC0Stride, 1, 1);
}
}
} else {
// Row length is a multiple of c0Size: copy each row as nBurst bursts of length 1 with copy_gm_to_cbuf.
if constexpr (std::is_same<T, int8_t>::value) {
dstStride = CeilAlign<uint16_t>(dstOriginShape0, c0Size) - 1;
}
for (int64_t i = 0; i < dstOriginShape0; i++) {
uint64_t gatherOffset = offsets[i + offsetsStartOffset];
gatherOffset = CalaOffset2PageAttention<uint64_t, T3, blockSize>(blockTable, gatherOffset);
copy_gm_to_cbuf(
dst + i * c0Size, src + gatherOffset * srcRawShape1 + srcColumnStartOffset, 0, nBurst, 1, 0, dstStride,
PAD_NONE);
}
}
pipe_barrier(PIPE_ALL);
}
// Compile-time-shape wrapper: forwards the template shape arguments to DynL1ToL0A<T, Offset0, Offset1> as
// runtime values.
template <typename T, unsigned dstM, unsigned dstK, unsigned Offset0, unsigned Offset1, unsigned srcM, unsigned srcK>
TILEOP void DynL1ToL0A(__ca__ T* dst, __cbuf__ T* src)
{
DynL1ToL0A<T, Offset0, Offset1>(dst, src, dstM, dstK, srcM, srcK);
}
// Compile-time-shape wrapper: forwards the template shape arguments to DynL1ToL0At<T, Offset0, Offset1> as
// runtime values (note the source shape order is srcK, srcM).
template <typename T, unsigned dstM, unsigned dstK, unsigned Offset0, unsigned Offset1, unsigned srcK, unsigned srcM>
TILEOP void DynL1ToL0At(__ca__ T* dst, __cbuf__ T* src)
{
DynL1ToL0At<T, Offset0, Offset1>(dst, src, dstM, dstK, srcK, srcM);
}
// Compile-time-shape wrapper: forwards the template shape arguments to DynL1ToL0B<T, Offset0, Offset1> as
// runtime values.
template <typename T, unsigned dstK, unsigned dstN, unsigned Offset0, unsigned Offset1, unsigned srcK, unsigned srcN>
TILEOP void DynL1ToL0B(__cb__ T* dst, __cbuf__ T* src)
{
DynL1ToL0B<T, Offset0, Offset1>(dst, src, dstK, dstN, srcK, srcN);
}
// Compile-time-shape wrapper: forwards the template shape arguments to DynL1ToL0Bt<T, Offset0, Offset1> as
// runtime values (note the source shape order is srcN, srcK).
template <typename T, unsigned dstK, unsigned dstN, unsigned Offset0, unsigned Offset1, unsigned srcN, unsigned srcK>
TILEOP void DynL1ToL0Bt(__cb__ T* dst, __cbuf__ T* src)
{
DynL1ToL0Bt<T, Offset0, Offset1>(dst, src, dstK, dstN, srcN, srcK);
}
/**
 * Copies a TShape0 x TShape1 tile from L1 (src) to GM (dst) with copy_cbuf_to_gm, one burst per row.
 *
 * \param TShape0  tile rows (burst count)
 * \param TShape1  tile columns; the burst length is TShape1 * sizeof(GMT) / BLOCK_SIZE
 * \param GmShape0 not used by this function
 * \param GmShape1 GM row length; the destination gap is (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE
 * \param reserved not used by this function
 *
 * When a row is shorter than one block (burst length 0) the tile is issued as a single burst whose length is
 * TShape0 * TShape1 * sizeof(GMT), or 1 if that is 0, with no gaps. All values are narrowed to uint16_t.
 */
template <typename GMT, typename L1T>
TILEOP void DynL1CopyOutND(
    __gm__ GMT* dst, __cbuf__ L1T* src, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1,
    int reserved)
{
    constexpr uint16_t srcGap = 0;
    const uint16_t blocksPerRow = TShape1 * sizeof(GMT) / BLOCK_SIZE;
    uint16_t burstCount = TShape0;
    uint16_t burstLen = blocksPerRow;
    uint16_t dstGap = (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE;
    if (blocksPerRow == 0) {
        // Sub-block rows: fall back to one gap-free burst covering the whole tile.
        const uint16_t wholeTile = TShape0 * TShape1 * sizeof(GMT);
        burstCount = 1;
        burstLen = (wholeTile == 0) ? static_cast<uint16_t>(1) : wholeTile;
        dstGap = 0;
    }
    copy_cbuf_to_gm(dst, src, 0, burstCount, burstLen, srcGap, dstGap);
}
// Forwards every argument unchanged to TileOp::DynL1CopyOutND<GMT, L1T>.
template <typename GMT, typename L1T>
TILEOP void DynL1CopyOut(
__gm__ GMT* dst, __cbuf__ L1T* src, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1,
int reserved)
{
TileOp::DynL1CopyOutND<GMT, L1T>(dst, src, TShape0, TShape1, GmShape0, GmShape1, reserved);
}
/**
 * Copies a TShape0 x TShape1 tile from GM (src) to L1 (dst) with copy_gm_to_cbuf, one burst per row and no
 * GM offset.
 *
 * \param TShape0  tile rows (burst count)
 * \param TShape1  tile columns; the burst length is TShape1 * sizeof(GMT) / 32
 * \param GmShape0 not used by this function
 * \param GmShape1 GM row length; (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE is passed as the destination
 *                 stride
 * \param reserved not used by this function
 *
 * When a row is shorter than one block (burst length 0) the tile is issued as a single burst whose length is
 * TShape0 * TShape1 * sizeof(GMT), or 1 if that is 0, with no gaps. All values are narrowed to uint16_t.
 * NOTE(review): the GM-derived gap goes into the destination-stride position although GM is the source of
 * this copy — confirm against the intrinsic's parameter order.
 */
template <typename GMT, typename L1T>
TILEOP void DynL1CopyIn(
    __cbuf__ L1T* dst, __gm__ GMT* src, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1,
    int reserved)
{
    constexpr uint16_t srcGap = 0;
    const uint16_t blocksPerRow = TShape1 * sizeof(GMT) / 32;
    uint16_t burstCount = TShape0;
    uint16_t burstLen = blocksPerRow;
    uint16_t dstGap = (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE;
    if (blocksPerRow == 0) {
        // Sub-block rows: fall back to one gap-free burst covering the whole tile.
        const uint16_t wholeTile = TShape0 * TShape1 * sizeof(GMT);
        burstCount = 1;
        burstLen = (wholeTile == 0) ? static_cast<uint16_t>(1) : wholeTile;
        dstGap = 0;
    }
    copy_gm_to_cbuf(dst, src, 0, burstCount, burstLen, srcGap, dstGap, PAD_NONE);
}
* Deprecated: The following interfaces are deprecated and will no longer be maintained.
*/
* brief: dynamic l1 copy in nz2nz functions with batch
*/
template <typename GMT, typename L1T, unsigned TShape0, unsigned TShape1>
TILEOP void DynL1CopyInNZ2NZ(
    __cbuf__ L1T* dst, __gm__ GMT* src, unsigned GmShape0, unsigned GmShape1, unsigned GmOffset0, unsigned GmOffset1,
    unsigned curH, unsigned curW, int reserved)
{
    // Burst description: TShape1 / C0 bursts, each TShape0 * C0 elements long, expressed in 32-byte units.
    // C0 comes from the GM element type; the offset below uses a C0 derived from the L1 element type.
    const int32_t gmC0 = 32 / sizeof(GMT);
    const uint16_t burstCount = TShape1 / gmC0;
    const uint16_t burstLen = TShape0 * gmC0 * sizeof(GMT) / 32;
    const uint16_t srcGap = (curH - TShape0) * gmC0 * sizeof(GMT) / 32;
    constexpr uint16_t dstGap = 0;

    // Source offset: whole batches first, then the column part scaled by curH, then the row part inside the
    // batch scaled by the L1 C0 size.
    const auto l1C0 = 32 / sizeof(L1T);
    const auto elemsPerBatch = curH * curW;
    const auto linearOffset = GmOffset0 * GmShape1 + GmOffset1;
    const auto batchNo = linearOffset / elemsPerBatch;
    const auto nzOffset = batchNo * elemsPerBatch + (GmOffset1 * curH) + (GmOffset0 - batchNo * curH) * l1C0;

    copy_gm_to_cbuf(dst, src + nzOffset, 0, burstCount, burstLen, srcGap, dstGap, PAD_NONE);
}
* brief: dynamic l1 copy in nd2nz functions
*/
template <typename GMT, typename L1T, unsigned TShape0, unsigned TShape1>
TILEOP void DynL1CopyIn(
    __cbuf__ L1T* dst, __gm__ GMT* src, unsigned GmShape0, unsigned GmShape1, unsigned GmOffset0, unsigned GmOffset1,
    int reserved)
{
    // Move to the tile's first element inside the GM tensor.
    src += CalcLinearOffset(GmShape1, GmOffset0, GmOffset1);

    // Tile geometry is fixed at compile time; only the GM row length is a runtime value.
    constexpr uint16_t kNdCount = 1;
    constexpr uint16_t kRows = TShape0;
    constexpr uint16_t kCols = TShape1;
    constexpr uint16_t kSrcMatrixStride = 0;
    constexpr uint16_t kDstC0Stride = TShape0;
    constexpr uint16_t kDstNStride = 1;
    constexpr uint16_t kDstMatrixStride = 1;
    const uint16_t gmRowLen = GmShape1;

    // Pick the intrinsic by GM element width; element types not listed here copy nothing.
    if constexpr (std::is_same<GMT, int8_t>::value) {
        copy_gm_to_cbuf_multi_nd2nz_b8(
            (__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, kNdCount, kRows, kCols, kSrcMatrixStride, gmRowLen,
            kDstC0Stride, kDstNStride, kDstMatrixStride);
    } else if constexpr (std::is_same<GMT, half>::value || std::is_same<GMT, bfloat16_t>::value) {
        copy_gm_to_cbuf_multi_nd2nz_b16(
            (__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, kNdCount, kRows, kCols, kSrcMatrixStride, gmRowLen,
            kDstC0Stride, kDstNStride, kDstMatrixStride);
    } else if constexpr (std::is_same<GMT, float>::value) {
        copy_gm_to_cbuf_multi_nd2nz_b32s(
            (__cbuf__ L1T*)dst, (__gm__ GMT*)src, 0, kNdCount, kRows, kCols, kSrcMatrixStride, gmRowLen,
            kDstC0Stride, kDstNStride, kDstMatrixStride);
    }
}
/**
 * Compile-time-shape variant of the L1 -> GM row copy: TShape0 bursts of TShape1 * sizeof(GMT) / BLOCK_SIZE
 * blocks, with a destination gap of (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE.
 *
 * When a row is shorter than one block (burst length 0) the tile is issued as a single burst whose length is
 * TShape0 * TShape1 * sizeof(GMT), or 1 if that is 0, with no gaps. All values are narrowed to uint16_t.
 * GmShape0 and reserved are not used.
 */
template <typename GMT, typename L1T, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1>
TILEOP void DynL1CopyOutND(__gm__ GMT* dst, __cbuf__ L1T* src, int reserved)
{
    constexpr uint16_t srcGap = 0;
    const uint16_t blocksPerRow = TShape1 * sizeof(GMT) / BLOCK_SIZE;
    uint16_t burstCount = TShape0;
    uint16_t burstLen = blocksPerRow;
    uint16_t dstGap = (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE;
    if (blocksPerRow == 0) {
        // Sub-block rows: fall back to one gap-free burst covering the whole tile.
        const uint16_t wholeTile = TShape0 * TShape1 * sizeof(GMT);
        burstCount = 1;
        burstLen = (wholeTile == 0) ? static_cast<uint16_t>(1) : wholeTile;
        dstGap = 0;
    }
    copy_cbuf_to_gm(dst, src, 0, burstCount, burstLen, srcGap, dstGap);
}
// Forwards to the compile-time-shape TileOp::DynL1CopyOutND with the same template and function arguments.
template <typename GMT, typename L1T, unsigned TShape0, unsigned TShape1, unsigned GmShape0, unsigned GmShape1>
TILEOP void DynL1CopyOut(__gm__ GMT* dst, __cbuf__ L1T* src, int reserved)
{
TileOp::DynL1CopyOutND<GMT, L1T, TShape0, TShape1, GmShape0, GmShape1>(dst, src, reserved);
}
/**
 * Copies a TShape0 x TShape1 tile from GM (src) to L1 (dst) with copy_gm_to_cbuf, one burst per row.
 *
 * \param GmShape0 not used by this function
 * \param GmShape1 GM row length; (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE is passed as the destination
 *                 stride
 * \param reserved not used by this function
 *
 * The burst length is TShape1 * sizeof(GMT) / 32. When that is 0 the tile is issued as a single burst whose
 * length is TShape0 * TShape1 * sizeof(GMT), or 1 if that is 0, with no gaps. All values are narrowed to
 * uint16_t.
 */
template <typename GMT, typename L1T, unsigned TShape0, unsigned TShape1>
TILEOP void DynL1CopyInND(__cbuf__ L1T* dst, __gm__ GMT* src, unsigned GmShape0, unsigned GmShape1, int reserved)
{
    constexpr uint16_t srcGap = 0;
    const uint16_t blocksPerRow = TShape1 * sizeof(GMT) / 32;
    uint16_t burstCount = TShape0;
    uint16_t burstLen = blocksPerRow;
    uint16_t dstGap = (GmShape1 - TShape1) * sizeof(GMT) / BLOCK_SIZE;
    if (blocksPerRow == 0) {
        // Sub-block rows: fall back to one gap-free burst covering the whole tile.
        const uint16_t wholeTile = TShape0 * TShape1 * sizeof(GMT);
        burstCount = 1;
        burstLen = (wholeTile == 0) ? static_cast<uint16_t>(1) : wholeTile;
        dstGap = 0;
    }
    copy_gm_to_cbuf(dst, src, 0, burstCount, burstLen, srcGap, dstGap, PAD_NONE);
}
/**
 * Compile-time-tile variant of the L0C -> GM copy (copy_matrix_cc_to_gm with the NZ2ND flag set to true).
 *
 * The copied extent is clipped to what remains of the GM tensor after the offsets: rows use
 * min(oriTShape0, GmShape0 - GmOffset0) and columns use min(TShape1, GmShape1 - GmOffset1), both narrowed to
 * uint16_t. oriTShape1 is not used.
 *
 * \param dst, src             GM tensor base and L0C source
 * \param GmShape0, GmShape1   GM tensor shape; GmShape1 is also the destination stride
 * \param GmOffset0, GmOffset1 position of the tile in GM
 * \param uf                   unit-flag mode, narrowed to uint8_t
 */
template <typename GMT, typename L0CT, unsigned TShape0, unsigned TShape1, unsigned oriTShape0, unsigned oriTShape1>
TILEOP void DynL0CCopyOut(
    __gm__ GMT* dst, __cc__ L0CT* src, unsigned GmShape0, unsigned GmShape1, unsigned GmOffset0, unsigned GmOffset1,
    int uf)
{
    // Clip the tile to the rows/columns left in GM.
    const unsigned rowsLeft = GmShape0 - GmOffset0;
    const unsigned colsLeft = GmShape1 - GmOffset1;
    const uint16_t rowCount = (oriTShape0 < rowsLeft) ? oriTShape0 : rowsLeft;
    const uint16_t colCount = (TShape1 < colsLeft) ? TShape1 : colsLeft;
    const uint32_t gmRowStride = GmShape1;
    const uint16_t l0cStride = TShape0;
    const uint8_t unitFlag = uf;
    constexpr uint8_t kReluPre = 0;
    constexpr bool kChannelSplit = false;
    constexpr bool kNz2Nd = true;

    // float accumulators are converted on the way out when GM holds half or bfloat16_t.
    uint64_t quantMode = NoQuant;
    if constexpr (std::is_same<L0CT, float>::value) {
        if constexpr (std::is_same<GMT, half>::value) {
            quantMode = QuantMode_t::F322F16;
        } else if constexpr (std::is_same<GMT, bfloat16_t>::value) {
            quantMode = QuantMode_t::F322BF16;
        } else {
            quantMode = QuantMode_t::NoQuant;
        }
    }

    // ND parameter word: ndNum = 1 in bits 0-15, source/destination ND strides (both 0) in bits 16-31 / 32-47.
    constexpr uint64_t kNdNum = 1;
    constexpr uint64_t kSrcNdStride = 0;
    constexpr uint64_t kDstNdStride = 0;
    constexpr uint64_t kNdPara =
        (kNdNum & 0xffff) | ((kSrcNdStride & 0xffff) << 16) | ((kDstNdStride & 0xffff) << 32);
    set_nd_para(kNdPara);

    copy_matrix_cc_to_gm(
        (__gm__ GMT*)(dst + (GmOffset0 * GmShape1) + GmOffset1), (__cc__ L0CT*)src, 0, colCount, rowCount,
        gmRowStride, l0cStride, unitFlag, quantMode, kReluPre, kChannelSplit, kNz2Nd);
}
/**
 * GM-accumulating form of the compile-time-tile DynL0CCopyOut: enables atomic addition for GMT, performs the
 * copy, and switches atomics off again when isAcc == 1.
 *
 * All function parameters are forwarded unchanged to the non-accumulating overload.
 * NOTE(review): for isAcc != 1 atomic addition is left enabled on return — confirm that is intended.
 */
template <
    typename GMT, typename L0CT, unsigned TShape0, unsigned TShape1, unsigned oriTShape0, unsigned oriTShape1,
    int isAcc>
TILEOP void DynL0CCopyOut(
    __gm__ GMT* dst, __cc__ L0CT* src, unsigned GmShape0, unsigned GmShape1, unsigned GmOffset0, unsigned GmOffset1,
    int uf)
{
    constexpr bool resetAtomicAfterCopy = (isAcc == 1);
    SetAtomicAddition<GMT>();
    DynL0CCopyOut<GMT, L0CT, TShape0, TShape1, oriTShape0, oriTShape1>(
        dst, src, GmShape0, GmShape1, GmOffset0, GmOffset1, uf);
    if constexpr (resetAtomicAfterCopy) {
        set_atomic_none();
    }
}
}
#endif