/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
/*!
 * \file kernel_operator_data_copy_intf.h
 * \brief Declarations of the AscendC DataCopy, Copy, DataCopyPad and related data-movement interfaces.
 */
#ifndef ASCENDC_MODULE_OPERATOR_DATA_COPY_INTERFACE_H
#define ASCENDC_MODULE_OPERATOR_DATA_COPY_INTERFACE_H
#include "kernel_macros.h"
#include "common_types.h"
#include "kernel_struct_data_copy.h"
#include "tile_api/kernel_tensor_tile_intf_utils.h"
#include "utils/kernel_utils_macros.h"
#include "utils/kernel_utils_struct_confusion_pad.h"
#include "kernel_tensor.h"
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
#include <cstddef>
#include <cstdint>
#include "stub_def.h"
#include "kernel_fp16.h"
#endif
namespace AscendC {
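/* **************************************************************************************************
 * DataCopy                                                                                         *
 * ************************************************************************************************* */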
* @ingroup DataCopy Level 0
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] repeatParams.blockCount number of blocks
* @param [in] repeatParams.blockLen Length of blocks
* @param [in] repeatParams.srcGap src block gap
* @param [in] repeatParams.dstGap dst block gap
*/
template <typename T>
__aicore__ inline void __inout_pipe__(MTE2)
DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src, const DataCopyParams& repeatParams);
* @ingroup DataCopy Level 0
* @brief format transform(such as nd2nz) during data load from OUT to L1
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] intriParams.ndNum nd number of data to be moved
* @param [in] intriParams.nValue n value
* @param [in] intriParams.dValue d value in unit of element
* @param [in] intriParams.srcNdMatrixStride stride between nd matrixs at source ND matrix in unit of element
* @param [in] intriParams.srcDValue SRC_D value in unit of element
* @param [in] intriParams.dstNzC0Stride stride of nz between 2 C0 in L1 in unit of C0_size
* @param [in] intriParams.dstNzNStride stride of n between 2 C0 in L1
* @param [in] intriParams.dstNzMatrixStride DST_nz_matrix_stride in L1 in unit of element
*/
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T, bool enableSmallC0 = false>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const Nd2NzParams& intriParams);
#else
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const Nd2NzParams& intriParams);
#endif
* @ingroup DataCopy Level 0
* @brief format transform(such as nd2nz) during data load from UB to L1(Only TSCM)
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] intriParams.ndNum nd number of data to be moved, onlyc can be 1
* @param [in] intriParams.nValue n value
* @param [in] intriParams.dValue d value in unit of element
* @param [in] intriParams.srcNdMatrixStride stride between nd matrixs at source ND matrix in unit of element
* @param [in] intriParams.srcDValue SRC_D value in unit of element
* @param [in] intriParams.dstNzC0Stride stride of nz between 2 C0 in L1 in unit of C0_size
* @param [in] intriParams.dstNzNStride stride of n between 2 C0 in L1
* @param [in] intriParams.dstNzMatrixStride DST_nz_matrix_stride in L1 in unit of element
*/
template <typename T>
__aicore__ inline void DataCopy(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const Nd2NzParams& intriParams);
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
/**
 * @ingroup DataCopy Level 0
 * @brief Format transform (such as dn2nz) performed while loading data from OUT to L1.
 *        Declared only when __NPU_ARCH__ is 3101 or 5102.
 * @tparam T element type shared by dst and src
 * @tparam enableSmallC0 defaults to false (its effect is defined by the implementation header)
 * @param [out] dst destination LocalTensor
 * @param [in] src source GlobalTensor
 * @param [in] intriParams.dnNum number of DN matrices to move
 * @param [in] intriParams.nValue n value
 * @param [in] intriParams.dValue d value, in elements
 * @param [in] intriParams.srcDnMatrixStride stride between DN matrices in the source, in elements
 * @param [in] intriParams.srcDValue SRC_D value, in elements
 * @param [in] intriParams.dstNzC0Stride stride of nz between 2 C0 in L1, in units of C0_size
 * @param [in] intriParams.dstNzNStride stride of n between 2 C0 in L1
 * @param [in] intriParams.dstNzMatrixStride DST_nz_matrix_stride in L1, in elements
 * @note Tagged with __inout_pipe__(MTE2).
 */
template <typename T, bool enableSmallC0 = false>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const Dn2NzParams& intriParams);
#endif
* @ingroup DataCopy Level 0
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output GlobalTensor
* @param [in] src input LocalTensor
* @param [in] repeatParams.blockCount number of blocks
* @param [in] repeatParams.blockLen Length of blocks
* @param [in] repeatParams.srcGap src block gap
* @param [in] repeatParams.dstGap dst block gap
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopy(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const DataCopyParams& repeatParams);
* @ingroup DataCopy Level 0
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatParams.blockCount number of blocks
* @param [in] repeatParams.blockLen Length of blocks
* @param [in] repeatParams.srcGap src block gap
* @param [in] repeatParams.dstGap dst block gap
*/
template <typename T>
__aicore__ inline void DataCopy(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const DataCopyParams& repeatParams);
* @ingroup DataCopy Level 0
* @brief datacopy from L1 to bt, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatParams.blockCount number of blocks
* @param [in] repeatParams.blockLen Length of blocks
* @param [in] repeatParams.srcGap src block gap
* @param [in] repeatParams.dstGap dst block gap
*/
template <typename T, typename U>
__aicore__ inline void DataCopy(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const DataCopyParams& repeatParams);
* @ingroup Copy Level 0
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] mask[]/mask mask array/count
* @param [in] repeatTime repeat times
* @param [in] repeatParams.dstStride dst block stride
* @param [in] repeatParams.srcStride src block stride
* @param [in] repeatParams.dstRepeatSize dst repeat stride
* @param [in] repeatParams.srcRepeatSize src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline __inout_pipe__(V) void Copy(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const uint64_t mask[], const uint8_t repeatTime,
const CopyRepeatParams& repeatParams);
/**
 * @ingroup Copy Level 0
 * @brief Copy vector data between two LocalTensors; this overload takes the mask as a single count.
 * @tparam T element type shared by dst and src
 * @tparam isSetMask defaults to true (NOTE(review): presumably selects whether this call programs
 *         the mask itself; confirm in the implementation header)
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] mask mask count
 * @param [in] repeatTime repeat times
 * @param [in] repeatParams.dstStride dst block stride
 * @param [in] repeatParams.srcStride src block stride
 * @param [in] repeatParams.dstRepeatSize dst repeat stride
 * @param [in] repeatParams.srcRepeatSize src repeat stride
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, bool isSetMask = true>
__aicore__ inline __inout_pipe__(V) void Copy(
    const LocalTensor<T>& dst,
    const LocalTensor<T>& src,
    const uint64_t mask,
    const uint8_t repeatTime,
    const CopyRepeatParams& repeatParams);
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
/**
 * @ingroup Copy
 * @brief Copy overload between two LocalTensors that takes only a count.
 *        Declared only when __NPU_ARCH__ is 3101 or 5102.
 * @tparam T element type shared by dst and src
 * @tparam isSetMask defaults to true (effect defined by the implementation header)
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] count NOTE(review): presumably the number of elements to copy; confirm in implementation
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, bool isSetMask = true>
__aicore__ inline __inout_pipe__(V) void Copy(
    const LocalTensor<T>& dst,
    const LocalTensor<T>& src,
    const uint32_t count);
#endif
* @ingroup DataCopy Level 1
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] SliceInfo dstSliceInfo[] ub
* @param [in] SliceInfo srcSliceInfo[] gm
* @param [in] dimValue dim value also for length for dstSliceInfo[] and srcSliceInfo[]
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const SliceInfo dstSliceInfo[], const SliceInfo srcSliceInfo[],
const uint32_t dimValue = 1);
* @ingroup DataCopy Level 1
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] SliceInfo dstSliceInfo[] gm
* @param [in] SliceInfo srcSliceInfo[] ub
* @param [in] dimValue dim value also for length for dstSliceInfo[] and srcSliceInfo[]
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopy(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const SliceInfo dstSliceInfo[], const SliceInfo srcSliceInfo[],
const uint32_t dimValue = 1);
* @ingroup DataCopy Level 2
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] count Number of operands
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const uint32_t count);
* @ingroup DataCopy Level 2
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output GlobalTensor
* @param [in] src input LocalTensor
* @param [in] count Number of operands
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopy(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const uint32_t count);
* @ingroup DataCopy Level 2
* @brief datacopy from src to dst, applicable to vector data
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] count Number of operands
*/
template <typename T>
__aicore__ inline void DataCopy(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const uint32_t count);
* @ingroup DataCopy Level 2
* @brief datacopy from src to dst, nz2nd, applicable to simulated cube data(such as data from l0c, 16*16)
* @param [out] dst output GlobalTensor
* @param [in] src input LocalTensor
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopy(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const Nz2NdParamsFull& intriParams);
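/* **************************************************************************************************
 * DataCopy Enhanced                                                                                *
 * ************************************************************************************************* */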
* @ingroup DataCopy
* @brief datacopy from src to dst, applicable to cube data
* @param [out] dst output LocalTensor
* @param [in] src input GlobalTensor
* @param [in] intriParams.blockCount number of blocks
* @param [in] intriParams.blockLen Length of blocks
* @param [in] intriParams.srcGap src block gap
* @param [in] intriParams.dstGap dst block gap
* @param [in] enhancedParams.blockMode Basic fractal of data movement
* @param [in] enhancedParams.deqScale Auxiliary parameters for path accuracy conversion
* @param [in] enhancedParams.deqValue size of convert with path precision
* @param [in] enhancedParams.sidStoreMode Multiplex input
* @param [in] enhancedParams.isRelu Configure whether Relu can be performed along the circuit
*/
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopy(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const DataCopyParams& intriParams,
const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced copy from a LocalTensor (src) out to a GlobalTensor (dst).
 * @tparam T element type shared by dst and src
 * @param [out] dst destination GlobalTensor
 * @param [in] src source LocalTensor
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(MTE3).
 */
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopy(
    const GlobalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced copy between two LocalTensors of the same element type.
 * @tparam T element type shared by dst and src
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 */
template <typename T>
__aicore__ inline void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Copy between two LocalTensors whose element types may differ, described by
 *        DataCopyCO12DstParams.
 * @tparam T element type of dst
 * @tparam U element type of src
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] intriParams transfer description (DataCopyCO12DstParams)
 */
template <typename T, typename U>
__aicore__ inline void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyCO12DstParams& intriParams);
/**
 * @ingroup DataCopy
 * @brief Copy from a LocalTensor (src) to a GlobalTensor (dst) whose element types may differ,
 *        described by DataCopyCO12DstParams.
 * @tparam T element type of dst
 * @tparam U element type of src
 * @param [out] dst destination GlobalTensor
 * @param [in] src source LocalTensor
 * @param [in] intriParams transfer description (DataCopyCO12DstParams)
 */
template <typename T, typename U>
__aicore__ inline void DataCopy(
    const GlobalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyCO12DstParams& intriParams);
#if (defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201))
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Declared only when __NPU_ARCH__ is 2201, and enabled only when PrimT<T> is bfloat16_t
 *        and PrimT<U> is float.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, bfloat16_t>::value && Std::is_same<PrimT<U>, float>::value,
              bool>::type = true>
__aicore__ inline void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
#endif
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is half and PrimT<U> is float.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, half>::value && Std::is_same<PrimT<U>, float>::value,
              bool>::type = true>
__aicore__ inline void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is half and PrimT<U> is int32_t.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, half>::value && Std::is_same<PrimT<U>, int32_t>::value,
              bool>::type = true>
__aicore__ inline __inout_pipe__(V) void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is int16_t and PrimT<U> is int32_t.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, int16_t>::value && Std::is_same<PrimT<U>, int32_t>::value,
              bool>::type = true>
__aicore__ inline __inout_pipe__(V) void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is int8_t and PrimT<U> is int32_t.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, int8_t>::value && Std::is_same<PrimT<U>, int32_t>::value,
              bool>::type = true>
__aicore__ inline __inout_pipe__(V) void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is uint8_t and PrimT<U> is int32_t.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, uint8_t>::value && Std::is_same<PrimT<U>, int32_t>::value,
              bool>::type = true>
__aicore__ inline __inout_pipe__(V) void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopy
 * @brief Enhanced LocalTensor-to-LocalTensor copy with differing element types.
 *        Enabled only when PrimT<T> is float and PrimT<U> is half.
 * @param [out] dst destination LocalTensor (element type T)
 * @param [in] src source LocalTensor (element type U)
 * @param [in] intriParams block copy description (DataCopyParams)
 * @param [in] enhancedParams additional enhanced-copy settings (DataCopyEnhancedParams)
 * @note Tagged with __inout_pipe__(V).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, float>::value && Std::is_same<PrimT<U>, half>::value,
              bool>::type = true>
__aicore__ inline __inout_pipe__(V) void DataCopy(
    const LocalTensor<T>& dst,
    const LocalTensor<U>& src,
    const DataCopyParams& intriParams,
    const DataCopyEnhancedParams& enhancedParams);
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad overloads driven by DataCopyParams.
 *        - GlobalTensor -> LocalTensor: takes the copy description plus DataCopyPadParams
 *          (tagged __inout_pipe__(MTE2)).
 *        - LocalTensor -> GlobalTensor: takes only the copy description
 *          (tagged __inout_pipe__(MTE3)).
 * @tparam T element type shared by dst and src
 * @tparam mode only present when __NPU_ARCH__ is 3101 or 5102; defaults to PaddingMode::Normal
 * @param [out] dst destination tensor
 * @param [in] src source tensor
 * @param [in] dataCopyParams copy description (DataCopyParams)
 * @param [in] padParams padding description (DataCopyPadParams); GlobalTensor -> LocalTensor only
 */
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T, PaddingMode mode = PaddingMode::Normal>
__aicore__ inline __inout_pipe__(MTE2) void DataCopyPad(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const DataCopyParams& dataCopyParams,
    const DataCopyPadParams& padParams);
template <typename T, PaddingMode mode = PaddingMode::Normal>
__aicore__ inline __inout_pipe__(MTE3) void DataCopyPad(
    const GlobalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyParams& dataCopyParams);
#else
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopyPad(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const DataCopyParams& dataCopyParams,
    const DataCopyPadParams& padParams);
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopyPad(
    const GlobalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyParams& dataCopyParams);
#endif
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad overload between two LocalTensors that takes both a DataCopyParams copy
 *        description and an Nd2NzParams format description.
 * @tparam T element type shared by dst and src
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] dataCopyParams copy description (DataCopyParams)
 * @param [in] nd2nzParams nd2nz description (Nd2NzParams)
 */
template <typename T>
__aicore__ inline void DataCopyPad(
    const LocalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyParams& dataCopyParams,
    const Nd2NzParams& nd2nzParams);
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad from a GlobalTensor (src) into a LocalTensor (dst), driven by
 *        DataCopyExtParams and DataCopyPadExtParams<T>.
 * @tparam T element type shared by dst, src and the padding description
 * @tparam mode only present when __NPU_ARCH__ is 3101 or 5102; defaults to PaddingMode::Normal
 * @param [out] dst destination LocalTensor
 * @param [in] src source GlobalTensor
 * @param [in] dataCopyParams copy description (DataCopyExtParams)
 * @param [in] padParams padding description (DataCopyPadExtParams<T>)
 * @note Tagged with __inout_pipe__(MTE2).
 */
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T, PaddingMode mode = PaddingMode::Normal>
__aicore__ inline __inout_pipe__(MTE2) void DataCopyPad(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams,
    const DataCopyPadExtParams<T>& padParams);
#else
template <typename T>
__aicore__ inline __inout_pipe__(MTE2) void DataCopyPad(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams,
    const DataCopyPadExtParams<T>& padParams);
#endif
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad from a GlobalTensor (src) into a LocalTensor (dst) where the padding
 *        description is typed on U rather than T.
 *        Enabled only when U is PrimT<T> and T itself is a different type from U.
 * @tparam T element type shared by dst and src
 * @tparam U type parameter of the padding description
 * @param [out] dst destination LocalTensor
 * @param [in] src source GlobalTensor
 * @param [in] dataCopyParams copy description (DataCopyExtParams)
 * @param [in] padParams padding description (DataCopyPadExtParams<U>)
 * @note Tagged with __inout_pipe__(MTE2).
 */
template <typename T, typename U,
          typename Std::enable_if<
              Std::is_same<PrimT<T>, U>::value && (!Std::is_same<T, U>::value),
              bool>::type = true>
__aicore__ inline __inout_pipe__(MTE2) void DataCopyPad(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams,
    const DataCopyPadExtParams<U>& padParams);
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad from a LocalTensor (src) out to a GlobalTensor (dst), driven by
 *        DataCopyExtParams.
 * @tparam T element type shared by dst and src
 * @tparam mode only present when __NPU_ARCH__ is 3101 or 5102; defaults to PaddingMode::Normal
 * @param [out] dst destination GlobalTensor
 * @param [in] src source LocalTensor
 * @param [in] dataCopyParams copy description (DataCopyExtParams)
 * @note Tagged with __inout_pipe__(MTE3).
 */
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T, PaddingMode mode = PaddingMode::Normal>
__aicore__ inline __inout_pipe__(MTE3) void DataCopyPad(
    const GlobalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams);
#else
template <typename T>
__aicore__ inline __inout_pipe__(MTE3) void DataCopyPad(
    const GlobalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams);
#endif
/**
 * @ingroup DataCopyPad
 * @brief DataCopyPad overload between two LocalTensors that takes both a DataCopyExtParams copy
 *        description and an Nd2NzParams format description.
 * @tparam T element type shared by dst and src
 * @param [out] dst destination LocalTensor
 * @param [in] src source LocalTensor
 * @param [in] dataCopyParams copy description (DataCopyExtParams)
 * @param [in] nd2nzParams nd2nz description (Nd2NzParams)
 */
template <typename T>
__aicore__ inline void DataCopyPad(
    const LocalTensor<T>& dst,
    const LocalTensor<T>& src,
    const DataCopyExtParams& dataCopyParams,
    const Nd2NzParams& nd2nzParams);
/**
 * @brief Declares SetPadValue, which receives a padding value of type T by value.
 *        NOTE(review): presumably this configures the fill value used by later padded copies;
 *        confirm against the implementation header.
 * @tparam T type of the padding value
 * @tparam pos tensor position selector; defaults to TPosition::MAX
 * @param [in] paddingValue the padding value
 */
template <typename T, TPosition pos = TPosition::MAX>
__aicore__ inline void SetPadValue(T paddingValue);
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
/**
 * @ingroup DataCopy
 * @brief Copy from a GlobalTensor (src) into a LocalTensor (dst), described by
 *        MultiCopyParams<T, dim>. Declared only when __NPU_ARCH__ is 3101 or 5102.
 *        NOTE(review): the NdDmaConfig/dim parameters suggest an N-dimensional DMA transfer;
 *        confirm against the implementation header.
 * @tparam T element type shared by dst, src and the copy description
 * @tparam dim dimension count forwarded to MultiCopyParams<T, dim>
 * @tparam config NdDmaConfig object bound by reference; defaults to kDefaultNdDmaConfig
 * @param [out] dst destination LocalTensor
 * @param [in] src source GlobalTensor
 * @param [in] params copy description (MultiCopyParams<T, dim>)
 */
template <typename T, uint8_t dim, const NdDmaConfig& config = kDefaultNdDmaConfig>
__aicore__ inline void DataCopy(
    const LocalTensor<T>& dst,
    const GlobalTensor<T>& src,
    const MultiCopyParams<T, dim>& params);
// Declares NdDmaDci: takes no arguments and returns nothing. Only declared when __NPU_ARCH__ is
// 3101 or 5102. NOTE(review): the name suggests an invalidate/reset of ND-DMA related state;
// confirm against the implementation header.
__aicore__ inline void NdDmaDci();
// Declares SetLoopModePara: receives loop-mode parameters (LoopModeParams) and a DataCopyMVType
// selector. Only declared when __NPU_ARCH__ is 3101 or 5102.
// NOTE(review): presumably `type` selects which data-move path the loop mode applies to; confirm.
__aicore__ inline void SetLoopModePara(const LoopModeParams& loopParams, DataCopyMVType type);
// Declares ResetLoopModePara: receives only a DataCopyMVType selector. Only declared when
// __NPU_ARCH__ is 3101 or 5102.
// NOTE(review): presumably restores the loop-mode settings for the selected path; confirm.
__aicore__ inline void ResetLoopModePara(DataCopyMVType type);
#endif
}
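/* **************************************************************************************************
 * DataCopy(Layout) API Level2                                                                      *
 * ************************************************************************************************* */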
namespace AscendC {
// Compile-time layout aliases parameterised on element type T and static row/column extents.

// NZLayout: the nested `type` of NZLayoutFormat<T, row, column>.
template <typename T, size_t row, size_t column>
using NZLayout = typename NZLayoutFormat<T, row, column>::type;
// RowMajorLayout: NDLayoutFormat<T, row, column> itself (no nested `type` is taken here).
template <typename T, size_t row, size_t column>
using RowMajorLayout = NDLayoutFormat<T, row, column>;
// ColumnMajorLayout: DNLayoutFormat<T, row, column> itself.
template <typename T, size_t row, size_t column>
using ColumnMajorLayout = DNLayoutFormat<T, row, column>;
// ZNLayout: ZNLayoutFormat<T, row, column> itself.
template <typename T, size_t row, size_t column>
using ZNLayout = ZNLayoutFormat<T, row, column>;
// Layout factory declarations. Each takes row and column as size_t arguments and returns
// decltype(auto), so the concrete return type is fixed by the definition in the implementation
// header rather than by this declaration.

// Factory for an NZ layout of element type T.
template <typename T>
__aicore__ inline decltype(auto) MakeNZLayout(size_t row, size_t column);
// Factory for a row-major layout of element type T.
template <typename T>
__aicore__ inline decltype(auto) MakeRowMajorLayout(size_t row, size_t column);
// Factory for a column-major layout of element type T.
template <typename T>
__aicore__ inline decltype(auto) MakeColumnMajorLayout(size_t row, size_t column);
// Factory for a ZN layout of element type T.
template <typename T>
__aicore__ inline decltype(auto) MakeZNLayout(size_t row, size_t column);
/**
 * @ingroup DataCopy Level 2
 * @brief Generic DataCopy overload taking dst and src as arbitrary types T and U.
 *        Participates in overload resolution only when VerifyingDataCopyTemplate<T, U> is true;
 *        the return type is void.
 * @tparam trait DataCopyTrait object bound by reference; defaults to DEFAULT_DATA_COPY_TRAIT
 * @tparam T type of dst
 * @tparam U type of src
 * @param [out] dst destination object
 * @param [in] src source object
 */
template <const DataCopyTrait& trait = DEFAULT_DATA_COPY_TRAIT,
          typename T,
          typename U>
__aicore__ inline
typename Std::enable_if<VerifyingDataCopyTemplate<T, U>, void>::type DataCopy(const T& dst, const U& src);
}
#include "../../impl/basic_api/kernel_operator_data_copy_intf_impl.h"
#endif