/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
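/*!
 * \file kernel_micro_datacopy_impl.h
 * \brief Load-side implementations of the MicroAPI DataCopy / Load interfaces
 *        (thin compile-time dispatchers over the vlds / vld / vldus / vldu / vsldb intrinsics).
 */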
#ifndef ASCENDC_MODULE_MICRO_DATACOPY_LOAD_IMPL_H
#define ASCENDC_MODULE_MICRO_DATACOPY_LOAD_IMPL_H
namespace AscendC {
namespace MicroAPI {
/*!
 * \brief Compile-time check that a distribution mode is allowed for an aligned load.
 *
 * The function has no runtime effect; it only hosts the static_asserts.
 *
 * \tparam outputNum Number of destination registers of the load. The value 1 selects the
 *                   single-register list; every other value selects the DINTLV list.
 * \tparam dist      Distribution mode to validate.
 */
template <int outputNum, LoadDist dist>
__simd_callee__ inline void CheckLoadDist()
{
    if constexpr (outputNum != 1) {
        // Loads that fill more than one register only accept the de-interleaving modes.
        static_assert(SupportEnum<dist, LoadDist::DIST_DINTLV_B8, LoadDist::DIST_DINTLV_B16,
                          LoadDist::DIST_DINTLV_B32>(),
            "LoadAlign not support this dist on current device");
    } else {
        // Single-register loads accept every mode listed here.
        static_assert(
            SupportEnum<dist, LoadDist::DIST_NORM, LoadDist::DIST_BRC_B8, LoadDist::DIST_BRC_B16,
                LoadDist::DIST_BRC_B32, LoadDist::DIST_US_B8, LoadDist::DIST_US_B16, LoadDist::DIST_DS_B8,
                LoadDist::DIST_DS_B16, LoadDist::DIST_UNPACK_B8, LoadDist::DIST_UNPACK_B16, LoadDist::DIST_BLK,
                LoadDist::DIST_E2B_B16, LoadDist::DIST_E2B_B32, LoadDist::DIST_UNPACK_B32,
                LoadDist::DIST_UNPACK4_B8>(),
            "LoadAlign not support this dist on current device");
    }
}
/*!
 * \brief Aligned load from srcAddr into dstReg with a fixed offset of 0 (no address update).
 *
 * The intrinsic overload is chosen at compile time from sizeof(ActualT) and the register trait
 * of U; register and pointer are reinterpreted as an integer type of suitable width where needed.
 *
 * \tparam T    Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dist Distribution mode, validated through CheckLoadDist<1, dist>. It is ignored on the
 *              RegTraitNumTwo paths, which always use DIST_DINTLV_B32 / DIST_DINTLV_B16.
 * \tparam U    Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param srcAddr Source address.
 */
template <typename T = DefaultType, LoadDist dist = LoadDist::DIST_NORM, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg, __ubuf__ T* srcAddr)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    CheckLoadDist<1, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vlds((RegTensor<uint8_t>&)dstReg, (__ubuf__ uint8_t*)srcAddr, 0, distValue);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        // 8-byte data is accessed through a uint32_t view.
        if constexpr (CheckRegTrait<U, RegTraitNumOne>()) {
            vlds((RegTensor<uint32_t>&)dstReg, (__ubuf__ uint32_t*)srcAddr, 0, distValue);
        } else if constexpr (CheckRegTrait<U, RegTraitNumTwo>()) {
            // Two-part register: split the 32-bit words into reg[0] / reg[1].
            constexpr auto dintlvDist =
                std::integral_constant<::Dist, static_cast<::Dist>(LoadDist::DIST_DINTLV_B32)>();
            vlds((RegTensor<uint32_t>&)dstReg.reg[0], (RegTensor<uint32_t>&)dstReg.reg[1],
                (__ubuf__ uint32_t*)srcAddr, 0, dintlvDist);
        }
    } else if constexpr (SupportType<ActualT, complex32>() && CheckRegTrait<U, RegTraitNumTwo>()) {
        // complex32 in a two-part register: split the 16-bit halves into reg[0] / reg[1].
        constexpr auto dintlvDist =
            std::integral_constant<::Dist, static_cast<::Dist>(LoadDist::DIST_DINTLV_B16)>();
        vlds((RegTensor<uint16_t>&)dstReg.reg[0], (RegTensor<uint16_t>&)dstReg.reg[1],
            (__ubuf__ uint16_t*)srcAddr, 0, dintlvDist);
    } else {
        static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
            "LoadAlign only support type b8/b16/b32/b64 on current device");
        // The type test uses ActualT, like the complex32 test above, so both branches agree.
        if constexpr (std::is_same_v<ActualT, complex32>) {
            vlds((RegTensor<int32_t>&)dstReg, (__ubuf__ int32_t*)srcAddr, 0, distValue);
        } else {
            vlds(dstReg, srcAddr, 0, distValue);
        }
    }
}
/*!
 * \brief Aligned load from srcAddr into dstReg, forwarding a post-update stride and mode to vlds.
 *
 * srcAddr is handed to the intrinsic by reference together with postMode.
 * NOTE(review): presumably the intrinsic advances srcAddr according to postMode - confirm against
 * the vlds specification.
 *
 * postUpdateStride is doubled on the paths that view one element as two narrower words
 * (8-byte data as uint32_t, complex32 in a two-part register as uint16_t).
 *
 * \tparam T        Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam postMode Post mode forwarded to the intrinsic.
 * \tparam dist     Distribution mode, validated through CheckLoadDist<1, dist>. It is ignored on the
 *                  RegTraitNumTwo paths, which always use DIST_DINTLV_B32 / DIST_DINTLV_B16.
 * \tparam U        Destination register type (must expose ActualT).
 * \param dstReg           Destination register.
 * \param srcAddr          Source address, passed by reference.
 * \param postUpdateStride Stride forwarded to the intrinsic.
 */
template <typename T = DefaultType, PostLiteral postMode, LoadDist dist = LoadDist::DIST_NORM, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg, __ubuf__ T*& srcAddr, int32_t postUpdateStride)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    CheckLoadDist<1, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vlds((RegTensor<uint8_t>&)dstReg, (__ubuf__ uint8_t*&)srcAddr, postUpdateStride, distValue, postValue);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        if constexpr (CheckRegTrait<U, RegTraitNumOne>()) {
            vlds((RegTensor<uint32_t>&)dstReg, (__ubuf__ uint32_t*&)srcAddr, postUpdateStride * 2, distValue,
                postValue);
        } else if constexpr (CheckRegTrait<U, RegTraitNumTwo>()) {
            constexpr auto dintlvDist =
                std::integral_constant<::Dist, static_cast<::Dist>(LoadDist::DIST_DINTLV_B32)>();
            vlds((RegTensor<uint32_t>&)dstReg.reg[0], (RegTensor<uint32_t>&)dstReg.reg[1],
                (__ubuf__ uint32_t*&)srcAddr, postUpdateStride * 2, dintlvDist, postValue);
        }
    } else if constexpr (SupportType<ActualT, complex32>() && CheckRegTrait<U, RegTraitNumTwo>()) {
        constexpr auto dintlvDist =
            std::integral_constant<::Dist, static_cast<::Dist>(LoadDist::DIST_DINTLV_B16)>();
        vlds((RegTensor<uint16_t>&)dstReg.reg[0], (RegTensor<uint16_t>&)dstReg.reg[1],
            (__ubuf__ uint16_t*&)srcAddr, postUpdateStride * 2, dintlvDist, postValue);
    } else {
        static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
            "LoadAlign only support type b8/b16/b32/b64 on current device");
        if constexpr (SupportBytes<ActualT, 4>()) {
            // All 4-byte types share the int32_t view.
            vlds((RegTensor<int32_t>&)dstReg, (__ubuf__ int32_t*&)srcAddr, postUpdateStride, distValue, postValue);
        } else {
            vlds(dstReg, srcAddr, postUpdateStride, distValue, postValue);
        }
    }
}
/*!
 * \brief Aligned load from srcAddr into dstReg, with the offset supplied by an address register.
 *
 * Only RegTraitNumOne registers are accepted. The vld overload is selected from sizeof(ActualT).
 *
 * \tparam T    Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dist Distribution mode, validated through CheckLoadDist<1, dist>.
 * \tparam U    Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param srcAddr Source base address.
 * \param offset  Address register forwarded to vld.
 */
template <typename T = DefaultType, LoadDist dist = LoadDist::DIST_NORM, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg, __ubuf__ T* srcAddr, AddrReg offset)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadAlign only support type b8/b16/b32/b64 on current device");
    CheckLoadDist<1, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vld((RegTensor<uint8_t>&)dstReg, (__ubuf__ uint8_t*)srcAddr, offset, distValue);
    } else if constexpr (SupportBytes<ActualT, 4, 8>()) {
        // 4-byte and 8-byte types both go through the int32_t view.
        vld((RegTensor<int32_t>&)dstReg, (__ubuf__ int32_t*)srcAddr, offset, distValue);
    } else {
        vld(dstReg, srcAddr, offset, distValue);
    }
}
/*!
 * \brief Aligned load from srcAddr into two registers with a fixed offset of 0.
 *
 * Only RegTraitNumOne registers and the DINTLV distribution modes (see CheckLoadDist<2, dist>)
 * are accepted.
 *
 * \tparam T    Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dist Distribution mode.
 * \tparam U    Destination register type (must expose ActualT).
 * \param dstReg0 First destination register.
 * \param dstReg1 Second destination register.
 * \param srcAddr Source address.
 */
template <typename T = DefaultType, LoadDist dist, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg0, U& dstReg1, __ubuf__ T* srcAddr)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadAlign only support type b8/b16/b32/b64 on current device");
    CheckLoadDist<2, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vlds((RegTensor<uint8_t>&)dstReg0, (RegTensor<uint8_t>&)dstReg1, (__ubuf__ uint8_t*)srcAddr, 0, distValue);
    } else if constexpr (SupportBytes<ActualT, 4>()) {
        vlds((RegTensor<int32_t>&)dstReg0, (RegTensor<int32_t>&)dstReg1, (__ubuf__ int32_t*)srcAddr, 0, distValue);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        vlds((RegTensor<int64_t>&)dstReg0, (RegTensor<int64_t>&)dstReg1, (__ubuf__ int64_t*)srcAddr, 0, distValue);
    } else {
        vlds(dstReg0, dstReg1, srcAddr, 0, distValue);
    }
}
/*!
 * \brief Aligned load from srcAddr into two registers, forwarding a post-update stride and mode.
 *
 * Only RegTraitNumOne registers and the DINTLV distribution modes (see CheckLoadDist<2, dist>)
 * are accepted. srcAddr is handed to the intrinsic by reference together with postMode.
 * NOTE(review): presumably the intrinsic advances srcAddr according to postMode - confirm against
 * the vlds specification.
 *
 * \tparam T        Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam postMode Post mode forwarded to the intrinsic.
 * \tparam dist     Distribution mode.
 * \tparam U        Destination register type (must expose ActualT).
 * \param dstReg0          First destination register.
 * \param dstReg1          Second destination register.
 * \param srcAddr          Source address, passed by reference.
 * \param postUpdateStride Stride forwarded unchanged to the intrinsic.
 */
template <typename T = DefaultType, PostLiteral postMode, LoadDist dist, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg0, U& dstReg1, __ubuf__ T*& srcAddr, int32_t postUpdateStride)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadAlign only support type b8/b16/b32/b64 on current device");
    CheckLoadDist<2, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vlds((RegTensor<uint8_t>&)dstReg0, (RegTensor<uint8_t>&)dstReg1, (__ubuf__ uint8_t*&)srcAddr,
            postUpdateStride, distValue, postValue);
    } else if constexpr (SupportBytes<ActualT, 4>()) {
        vlds((RegTensor<int32_t>&)dstReg0, (RegTensor<int32_t>&)dstReg1, (__ubuf__ int32_t*&)srcAddr,
            postUpdateStride, distValue, postValue);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        vlds((RegTensor<int64_t>&)dstReg0, (RegTensor<int64_t>&)dstReg1, (__ubuf__ int64_t*&)srcAddr,
            postUpdateStride, distValue, postValue);
    } else {
        vlds(dstReg0, dstReg1, srcAddr, postUpdateStride, distValue, postValue);
    }
}
/*!
 * \brief Aligned load from srcAddr into two registers, with the offset supplied by an address register.
 *
 * Only RegTraitNumOne registers and the DINTLV distribution modes (see CheckLoadDist<2, dist>)
 * are accepted.
 *
 * \tparam T    Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dist Distribution mode.
 * \tparam U    Destination register type (must expose ActualT).
 * \param dstReg0 First destination register.
 * \param dstReg1 Second destination register.
 * \param srcAddr Source base address.
 * \param offset  Address register forwarded to vld.
 */
template <typename T = DefaultType, LoadDist dist, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg0, U& dstReg1, __ubuf__ T* srcAddr, AddrReg offset)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadAlign only support type b8/b16/b32/b64 on current device");
    CheckLoadDist<2, dist>();
    constexpr auto distValue = std::integral_constant<::Dist, static_cast<::Dist>(dist)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vld((RegTensor<uint8_t>&)dstReg0, (RegTensor<uint8_t>&)dstReg1, (__ubuf__ uint8_t*)srcAddr, offset,
            distValue);
    } else if constexpr (SupportBytes<ActualT, 4, 8>()) {
        // 4-byte and 8-byte types both go through the int32_t view.
        vld((RegTensor<int32_t>&)dstReg0, (RegTensor<int32_t>&)dstReg1, (__ubuf__ int32_t*)srcAddr, offset,
            distValue);
    } else {
        vld(dstReg0, dstReg1, srcAddr, offset, distValue);
    }
}
/*!
 * \brief Prepares ureg for an unaligned load from srcAddr by issuing vldas.
 *
 * The pointer is reinterpreted according to sizeof(T) before the call.
 *
 * \tparam T Element type; its size must be 1, 2, 4 or 8 bytes.
 * \param ureg    Unalign register handed to vldas.
 * \param srcAddr Source address.
 */
template <typename T>
__simd_callee__ inline void DataCopyUnAlignPreImpl(UnalignReg& ureg, __ubuf__ T* srcAddr)
{
    static_assert(SupportBytes<T, 1, 2, 4, 8>(),
        "LoadUnAlignPre only support type b8/b16/b32/b64 on current device");
    if constexpr (SupportBytes<T, 1>()) {
        // Every 1-byte type (bool included) uses the uint8_t view.
        vldas(ureg, (__ubuf__ uint8_t*)srcAddr);
    } else if constexpr (SupportBytes<T, 8>()) {
        vldas(ureg, (__ubuf__ uint32_t*)srcAddr);
    } else if constexpr (SupportBytes<T, 4>()) {
        vldas(ureg, (__ubuf__ int32_t*)srcAddr);
    } else {
        vldas(ureg, srcAddr);
    }
}
/*!
 * \brief Unaligned load from srcAddr into dstReg with post-update, using vldus.
 *
 * Only PostLiteral::POST_MODE_UPDATE is accepted. For two-part registers (8-byte types, or
 * complex32 with RegTraitNumTwo) the data is fetched with two vldus calls into temporaries and
 * then split into dstReg.reg[0] / dstReg.reg[1] with DeInterleave.
 *
 * stride is doubled wherever one element is viewed as two narrower words
 * (8-byte data as uint32_t, complex32 as uint16_t).
 *
 * \tparam T        Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam postMode Post mode forwarded to the intrinsic.
 * \tparam U        Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param ureg    Unalign register shared between consecutive unaligned loads.
 * \param srcAddr Source address, passed by reference to the intrinsic.
 * \param stride  Stride forwarded to vldus.
 *                NOTE(review): presumably counted in ActualT elements - confirm with callers.
 */
template <typename T = DefaultType, PostLiteral postMode = PostLiteral::POST_MODE_UPDATE, typename U>
__simd_callee__ inline void DataCopyUnAlignImpl(U& dstReg, UnalignReg& ureg, __ubuf__ T*& srcAddr,
    uint32_t stride)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(SupportEnum<postMode, PostLiteral::POST_MODE_UPDATE>(),
        "LoadUnAlign only support update mode when load from local memory!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadUnAlign only support type b8/b16/b32/b64 on current device");
    constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vldus((RegTensor<uint8_t>&)dstReg, ureg, (__ubuf__ uint8_t*&)srcAddr, stride, postValue);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        if constexpr (CheckRegTrait<U, RegTraitNumOne>()) {
            vldus((RegTensor<uint32_t>&)dstReg, ureg, (__ubuf__ uint32_t*&)srcAddr, stride * 2, postValue);
        } else if constexpr (CheckRegTrait<U, RegTraitNumTwo>()) {
            // Split the stride: the first load takes at most oneRepeatNum elements, the second the rest.
            constexpr uint32_t oneRepeatNum = VECTOR_REG_WIDTH / sizeof(ActualT);
            const uint32_t firstStride = (stride > oneRepeatNum) ? oneRepeatNum : stride;
            const uint32_t secondStride = stride - firstStride;
            RegTensor<uint32_t> firstLoad;
            RegTensor<uint32_t> secondLoad;
            vldus(firstLoad, ureg, (__ubuf__ uint32_t*&)srcAddr, firstStride * 2, postValue);
            vldus(secondLoad, ureg, (__ubuf__ uint32_t*&)srcAddr, secondStride * 2, postValue);
            DeInterleave((RegTensor<uint32_t>&)dstReg.reg[0], (RegTensor<uint32_t>&)dstReg.reg[1], firstLoad,
                secondLoad);
        }
    } else if constexpr (SupportType<ActualT, complex32>() && CheckRegTrait<U, RegTraitNumTwo>()) {
        // Same two-step scheme as above, on 16-bit words.
        constexpr uint32_t oneRepeatNum = VECTOR_REG_WIDTH / sizeof(ActualT);
        const uint32_t firstStride = (stride > oneRepeatNum) ? oneRepeatNum : stride;
        const uint32_t secondStride = stride - firstStride;
        RegTensor<uint16_t> firstLoad;
        RegTensor<uint16_t> secondLoad;
        vldus(firstLoad, ureg, (__ubuf__ uint16_t*&)srcAddr, firstStride * 2, postValue);
        vldus(secondLoad, ureg, (__ubuf__ uint16_t*&)srcAddr, secondStride * 2, postValue);
        DeInterleave((RegTensor<uint16_t>&)dstReg.reg[0], (RegTensor<uint16_t>&)dstReg.reg[1], firstLoad,
            secondLoad);
    } else if constexpr (SupportBytes<ActualT, 4>()) {
        // All 4-byte types share the int32_t view.
        vldus((RegTensor<int32_t>&)dstReg, ureg, (__ubuf__ int32_t*&)srcAddr, stride, postValue);
    } else {
        vldus(dstReg, ureg, srcAddr, stride, postValue);
    }
}
/*!
 * \brief Unaligned load from srcAddr into dstReg without address update, using vldus.
 *
 * The overload is selected from sizeof(ActualT), the same type the width static_assert checks.
 *
 * \tparam T Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam U Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param ureg    Unalign register; DataCopyUnAlignPreImpl issues the matching vldas on it.
 * \param srcAddr Source address.
 */
template <typename T = DefaultType, typename U>
__simd_callee__ inline void DataCopyUnAlignImpl(U& dstReg, UnalignReg& ureg, __ubuf__ T* srcAddr)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "LoadUnAlign only support type b8/b16/b32/b64 on current device");
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vldus((RegTensor<uint8_t>&)dstReg, ureg, (__ubuf__ uint8_t*)srcAddr);
    } else if constexpr (SupportBytes<ActualT, 8>()) {
        vldus((RegTensor<int64_t>&)dstReg, ureg, (__ubuf__ int64_t*)srcAddr);
    } else if constexpr (SupportBytes<ActualT, 4>()) {
        vldus((RegTensor<int32_t>&)dstReg, ureg, (__ubuf__ int32_t*)srcAddr);
    } else {
        vldus(dstReg, ureg, srcAddr);
    }
}
/*!
 * \brief Loads dstReg from srcAddr through the unaligned-load pair (pre step, then load).
 *
 * A local unalign register is created, prepared with DataCopyUnAlignPreImpl, and then consumed by
 * the non-updating DataCopyUnAlignImpl overload. Only RegTraitNumOne registers are accepted.
 *
 * \tparam T Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam U Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param srcAddr Source address.
 */
template <typename T = DefaultType, typename U>
__simd_callee__ inline void LoadImpl(U& dstReg, __ubuf__ T* srcAddr)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(),
        "Load only support type b8/b16/b32/b64 on current device");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only suppoort RegTraitNumOne on current device!");

    // Step 1: prepare the unalign register for srcAddr.
    UnalignRegForLoad unalignState;
    DataCopyUnAlignPreImpl<T>(unalignState, srcAddr);

    // Step 2: perform the load itself; srcAddr is not updated.
    DataCopyUnAlignImpl<T, U>(dstReg, unalignState, srcAddr);
}
/*!
 * \brief Prepares ureg for an unaligned load addressed through an address register, by issuing vlda.
 *
 * The pointer is reinterpreted according to sizeof(T) before the call.
 *
 * \tparam T Element type; its size must be 1, 2, 4 or 8 bytes.
 * \param ureg    Unalign register handed to vlda.
 * \param srcAddr Source base address.
 * \param areg    Address register forwarded to vlda.
 */
template <typename T>
__simd_callee__ inline void DataCopyUnAlignPreImpl(UnalignReg& ureg, __ubuf__ T* srcAddr, AddrReg& areg)
{
    static_assert(SupportBytes<T, 1, 2, 4, 8>(),
        "LoadUnAlignPre only support type b8/b16/b32/b64 on current device");
    if constexpr (SupportBytes<T, 1>()) {
        // Every 1-byte type (bool included) uses the uint8_t view.
        vlda(ureg, (__ubuf__ uint8_t*)srcAddr, areg);
    } else if constexpr (SupportBytes<T, 4, 8>()) {
        // 4-byte and 8-byte types both go through the int32_t view.
        vlda(ureg, (__ubuf__ int32_t*)srcAddr, areg);
    } else {
        vlda(ureg, srcAddr, areg);
    }
}
/*!
 * \brief Unaligned load from srcAddr into dstReg addressed through an address register, using vldu.
 *
 * Only RegTraitNumOne registers are accepted. The overload is selected from sizeof(ActualT), the
 * same type the width static_assert checks.
 *
 * \tparam T Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam U Destination register type (must expose ActualT).
 * \param dstReg  Destination register.
 * \param ureg    Unalign register shared with the matching pre step.
 * \param srcAddr Source address, passed by reference to the intrinsic.
 * \param areg    Address register forwarded to vldu.
 * \param inc     Increment forwarded to vldu.
 *                NOTE(review): unit (elements vs. bytes) is not visible here - confirm.
 */
template <typename T = DefaultType, typename U>
__simd_callee__ inline void DataCopyUnAlignImpl(U& dstReg, UnalignReg& ureg, __ubuf__ T*& srcAddr,
    AddrReg& areg, uint32_t inc)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4, 8>(), "LoadUnAlign only support type b8/b16/b32/b64 on current device");
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vldu((RegTensor<uint8_t>&)dstReg, ureg, areg, (__ubuf__ uint8_t*&)srcAddr, inc);
    } else if constexpr (SupportBytes<ActualT, 4, 8>()) {
        // 4-byte and 8-byte types both go through the int32_t view.
        vldu((RegTensor<int32_t>&)dstReg, ureg, areg, (__ubuf__ int32_t*&)srcAddr, inc);
    } else {
        vldu(dstReg, ureg, areg, srcAddr, inc);
    }
}
/*!
 * \brief Masked load with a data-block stride, using vsldb (no address update).
 *
 * dataBlockStride is placed in the upper 16 bits of the 32-bit stride word passed to vsldb; the
 * lower 16 bits are left at zero. Only RegTraitNumOne registers and 1/2/4-byte types are accepted.
 *
 * \tparam T        Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dataMode Data copy mode; not referenced by this implementation.
 * \tparam U        Destination register type (must expose ActualT).
 * \param dstReg          Destination register.
 * \param srcAddr         Source address.
 * \param dataBlockStride Stride value; bits above the low 16 are shifted out.
 * \param mask            Mask register forwarded to vsldb.
 */
template <typename T = DefaultType, DataCopyMode dataMode, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg, __ubuf__ T* srcAddr, uint32_t dataBlockStride,
    MaskReg& mask)
{
    using ActualT = typename U::ActualT;
    static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
    static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
    static_assert(SupportBytes<ActualT, 1, 2, 4>(), "LoadAlign only support type b8/b16/b32 on current device");
    const uint32_t strideConfig = dataBlockStride << 16u;
    if constexpr (SupportBytes<ActualT, 1>()) {
        // Every 1-byte type (bool included) is loaded through the uint8_t view.
        vsldb((RegTensor<uint8_t>&)dstReg, (__ubuf__ uint8_t*)srcAddr, strideConfig, mask);
    } else if constexpr (std::is_same_v<ActualT, complex32>) {
        // complex32 is loaded through the int32_t view.
        vsldb((RegTensor<int32_t>&)dstReg, (__ubuf__ int32_t*)srcAddr, strideConfig, mask);
    } else {
        vsldb(dstReg, srcAddr, strideConfig, mask);
    }
}
/*!
 * \brief Masked load with data-block and repeat strides, using vsldb, with optional post-update.
 *
 * With PostLiteral::POST_MODE_NORMAL the call is forwarded to the non-updating overload and
 * repeatStride is not used. Otherwise the two strides are packed into one 32-bit word
 * (dataBlockStride in the upper 16 bits, the low 16 bits of repeatStride in the lower half) and
 * passed to vsldb together with postMode; srcAddr is handed over by reference.
 *
 * \tparam T        Element type of srcAddr; must be DefaultType or U::ActualT.
 * \tparam dataMode Data copy mode; only forwarded to the non-updating overload.
 * \tparam postMode Post mode selecting the path described above.
 * \tparam U        Destination register type (must expose ActualT).
 * \param dstReg          Destination register.
 * \param srcAddr         Source address, passed by reference.
 * \param dataBlockStride Stride value for the upper half of the stride word.
 * \param repeatStride    Stride value for the lower half of the stride word (masked to 16 bits).
 * \param mask            Mask register forwarded to vsldb.
 */
template <typename T = DefaultType, DataCopyMode dataMode, PostLiteral postMode, typename U>
__simd_callee__ inline void DataCopyImpl(U& dstReg, __ubuf__ T*& srcAddr, uint32_t dataBlockStride,
    uint32_t repeatStride, MaskReg &mask)
{
    if constexpr (postMode == PostLiteral::POST_MODE_NORMAL) {
        DataCopyImpl<T, dataMode, U>(dstReg, srcAddr, dataBlockStride, mask);
    } else {
        using ActualT = typename U::ActualT;
        static_assert(std::is_same_v<T, DefaultType> || std::is_same_v<T, ActualT>, "T type is not correct!");
        static_assert(CheckRegTrait<U, RegTraitNumOne>(), "RegTensor only support RegTraitNumOne on current device!");
        static_assert(SupportBytes<ActualT, 1, 2, 4>(), "LoadAlign only support type b8/b16/b32");
        constexpr auto postValue = std::integral_constant<::Post, static_cast<::Post>(postMode)>();
        const uint32_t strideConfig = (dataBlockStride << 16u) | (repeatStride & 0xFFFFU);
        if constexpr (SupportBytes<ActualT, 1>()) {
            // Every 1-byte type (bool included) is loaded through the uint8_t view.
            vsldb((RegTensor<uint8_t>&)dstReg, (__ubuf__ uint8_t*&)srcAddr, strideConfig, mask, postValue);
        } else if constexpr (std::is_same_v<ActualT, complex32>) {
            // complex32 is loaded through the int32_t view.
            vsldb((RegTensor<int32_t>&)dstReg, (__ubuf__ int32_t*&)srcAddr, strideConfig, mask, postValue);
        } else {
            vsldb(dstReg, srcAddr, strideConfig, mask, postValue);
        }
    }
}
}
}
#endif