* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file kernel_operator_data_copy_base_impl.h
* \brief
*/
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message("impl/basic_api/kernel_operator_data_copy_base_impl.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"basic_api/kernel_operator_data_copy_intf.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_DATA_COPY_BASE_IMPL_H__
#endif
#ifndef ASCENDC_MODULE_OPERATOR_DATA_COPY_BASE_IMPL_H
#define ASCENDC_MODULE_OPERATOR_DATA_COPY_BASE_IMPL_H
#include "kernel_tensor.h"
#include "kernel_process_lock.h"
#include "kernel_struct_data_copy.h"
#if ASCENDC_CPU_DEBUG
#include "kernel_check.h"
#endif
#if __NPU_ARCH__ == 1001
#include "dav_c100/kernel_operator_data_copy_impl.h"
#elif __NPU_ARCH__ == 2002
#include "dav_m200/kernel_operator_data_copy_impl.h"
#elif __NPU_ARCH__ == 2201
#include "dav_c220/kernel_operator_data_copy_impl.h"
#include "dav_c220/kernel_operator_set_atomic_impl.h"
#elif __NPU_ARCH__ == 3002
#include "dav_m300/kernel_operator_data_copy_impl.h"
#elif __NPU_ARCH__ == 3102
#include "dav_m310/kernel_operator_data_copy_impl.h"
#elif __NPU_ARCH__ == 3510
#include "dav_3510/kernel_operator_data_copy_impl.h"
#elif (__NPU_ARCH__ == 5102)
#include "dav_m510/kernel_operator_data_copy_impl.h"
#elif (__NPU_ARCH__ == 3003)
#include "dav_l300/kernel_operator_data_copy_impl.h"
#elif (__NPU_ARCH__ == 3113)
#include "dav_l311/kernel_operator_data_copy_impl.h"
#endif
namespace AscendC {
#if __NPU_ARCH__ == 2201
template <typename T, enum ReduceType reduceType = ReduceType::NONE>
__aicore__ inline void DataCopyWithReduce(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const uint32_t count)
{
struct DataCopyParams repeatParams;
repeatParams.blockLen = count / AscendCUtils::GetC0Count(sizeof(T));
DataCopyWithReduce<T, reduceType>(dst, src, repeatParams);
}
template <typename T, enum ReduceType reduceType = ReduceType::NONE>
__aicore__ inline void DataCopyWithReduce(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const DataCopyParams& repeatParams)
{
AscendC::SetAtomicNoneImpl();
if constexpr (reduceType == ReduceType::SUM) {
AscendC::SetAtomicAddImpl<T>();
} else if constexpr (reduceType == ReduceType::MIN) {
AscendC::SetAtomicMinImpl<T>();
} else if constexpr (reduceType == ReduceType::MAX) {
AscendC::SetAtomicMaxImpl<T>();
}
DataCopy(dst, src, repeatParams);
AscendC::SetAtomicNoneImpl();
}
template <typename T, enum ReduceType reduceType = ReduceType::NONE>
__aicore__ inline void DataCopyPadWithReduce(const GlobalTensor<T>& dst, const LocalTensor<T>& src,
const DataCopyExtParams& dataCopyExtParams)
{
AscendC::SetAtomicNoneImpl();
if constexpr (reduceType == ReduceType::SUM) {
AscendC::SetAtomicAddImpl<T>();
} else if constexpr (reduceType == ReduceType::MIN) {
AscendC::SetAtomicMinImpl<T>();
} else if constexpr (reduceType == ReduceType::MAX) {
AscendC::SetAtomicMaxImpl<T>();
}
DataCopyPad(dst, src, dataCopyExtParams);
AscendC::SetAtomicNoneImpl();
}
#endif
__aicore__ inline void DataCopyGetOffsetList(
const SliceInfo sliceInfo[], uint32_t shapeInfo[], const uint32_t dimValue, uint32_t *count, uint32_t *offsetList)
{
uint32_t sliceSize = 1;
uint32_t copyCount = 1;
uint32_t currentCount = 1;
uint32_t preCopyCount = 0;
uint32_t iter = 0;
uint32_t totalSliceCount = 0;
for (uint32_t i = 0; i < dimValue; i++) {
if (i == 0) {
*(offsetList + totalSliceCount) = 0;
totalSliceCount++;
continue;
}
iter = 0;
sliceSize = sliceSize * shapeInfo[i - 1];
currentCount =
(sliceInfo[i].endIndex - sliceInfo[i].startIndex + 1 + sliceInfo[i].stride) / (1 + sliceInfo[i].stride);
preCopyCount = copyCount;
copyCount = copyCount * currentCount;
for (uint32_t j = preCopyCount; j < copyCount; j += preCopyCount) {
iter++;
for (uint32_t k = 0; k < preCopyCount; k++) {
*(offsetList + totalSliceCount) =
(*(offsetList + k)) + (iter * (1 + sliceInfo[i].stride)) * sliceSize;
totalSliceCount++;
}
}
}
*count = totalSliceCount;
}
__aicore__ inline uint32_t DataCopyGetPhyStartIndex(
const SliceInfo sliceInfo[], uint32_t shapeInfo[], const uint32_t dimValue)
{
uint32_t phyStartIndex = 0;
uint32_t sliceSize = 1;
for (uint32_t i = 0; i < dimValue; i++) {
if (i == 0) {
phyStartIndex = phyStartIndex + sliceInfo[i].startIndex;
} else {
sliceSize = sliceSize * shapeInfo[i - 1];
phyStartIndex = phyStartIndex + sliceSize * sliceInfo[i].startIndex;
}
}
return phyStartIndex;
}
#if (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102)
template <typename T, bool enableSmallC0 = false>
__aicore__ inline void DataCopyGM2L1ND2NZ(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const Nd2NzParams& intriParams)
{
using PrimType = PrimT<T>;
const uint8_t cacheMode = ExtractCacheMode(src);
DataCopyGM2L1ND2NZImpl<PrimType, enableSmallC0>((__cbuf__ PrimType*)dst.GetPhyAddr(),
(__gm__ PrimType*)src.GetPhyAddr(), intriParams, cacheMode);
}
template <typename T>
__aicore__ inline void DataCopyGM2UBND2NZ(const LocalTensor<T>& dst, const GlobalTensor<T>& src,
const Nd2NzParams& intriParams)
{
using PrimType = PrimT<T>;
const uint8_t cacheMode = ExtractCacheMode(src);
DataCopyGM2UBND2NZImpl((__ubuf__ PrimType*)dst.GetPhyAddr(), (__gm__ PrimType*)src.GetPhyAddr(), intriParams,
cacheMode);
}
#endif
}
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_DATA_COPY_BASE_IMPL_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_DATA_COPY_BASE_IMPL_H__
#endif