* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file kernel_operator_vec_reduce_intf_impl.h
* \brief
*/
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message("impl/basic_api/kernel_operator_vec_reduce_intf_impl.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"basic_api/kernel_operator_vec_reduce_intf.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_VEC_REDUCE_INTF_IMPL_H__
#endif
#ifndef ASCENDC_MODULE_OPERATOR_VEC_REDUCE_INTERFACE_IMPL_H
#define ASCENDC_MODULE_OPERATOR_VEC_REDUCE_INTERFACE_IMPL_H
#include "kernel_tensor.h"
#include "mstx_local_tensor_info.h"
#if __NPU_ARCH__ == 1001
#include "dav_c100/kernel_operator_vec_reduce_impl.h"
#elif __NPU_ARCH__ == 2002
#include "dav_m200/kernel_operator_vec_reduce_impl.h"
#elif __NPU_ARCH__ == 2201
#include "dav_c220/kernel_operator_vec_reduce_impl.h"
#elif __NPU_ARCH__ == 3002
#include "dav_m300/kernel_operator_vec_reduce_impl.h"
#elif __NPU_ARCH__ == 3102
#include "dav_m310/kernel_operator_vec_reduce_impl.h"
#elif __NPU_ARCH__ == 3510
#include "dav_3510/kernel_operator_vec_reduce_impl.h"
#elif (__NPU_ARCH__ == 5102)
#include "dav_m510/kernel_operator_vec_reduce_impl.h"
#elif (__NPU_ARCH__ == 3003)
#include "dav_l300/kernel_operator_vec_reduce_impl.h"
#elif (__NPU_ARCH__ == 3113)
#include "dav_l311/kernel_operator_vec_reduce_impl.h"
#endif
#include "kernel_check.h"
#include "kernel_npu_debug.h"
namespace AscendC {
#pragma begin_pipe(V)
namespace Internal {
template <typename T>
__aicore__ inline void CheckReduceBaseParams(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const __gm__ char* apiName)
{
using PrimType = PrimT<T>;
ASCENDC_DEBUG_ASSERT((SupportType<PrimType, half, float>()), KERNEL_LOG_INTERNAL(KERNEL_ERROR,
"Failed to check dtype in %s, current api support dtype combination is "
"src and dst both: half / float.\n", apiName));
ReportNopWarning<int32_t>(repeatTime, "repeatTime", apiName);
CheckValueRange<int32_t>(repeatTime, 0, 255, "repeatTime", apiName);
CheckTensorPhyPosition<Hardware::UB>(dst, "dst", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorPhyPosition<Hardware::UB>(src, "src", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorAlignment(src, ONE_BLK_SIZE, "src", apiName);
}
template <typename T, typename U>
__aicore__ inline void CheckReduceDataBlockParams(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const int32_t repeatTime, const __gm__ char* apiName)
{
CheckReduceBaseParams<T>(dst, src, repeatTime, apiName);
using DstPrimType = PrimT<T>;
CheckTensorAlignment(dst, 8 * sizeof(DstPrimType), "dst", apiName);
}
template <typename T, typename U>
__aicore__ inline void CheckReduceRepeatCommonParams(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const int32_t repeatTime, ReduceOrder order, const __gm__ char* apiName)
{
#if __NPU_ARCH__ == 3510
CheckValueRange<int32_t>(repeatTime, 0, 255, "repeatTime", apiName);
ReportNopWarning<int32_t>(repeatTime, "repeatTime", apiName);
CheckTensorPhyPosition<Hardware::UB>(dst, "dst", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorPhyPosition<Hardware::UB>(src, "src", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorAlignment(src, ONE_BLK_SIZE, "src", apiName);
#else
CheckReduceBaseParams<T>(dst, src, repeatTime, apiName);
#endif
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 2201) || \
(__NPU_ARCH__ == 3002) || (__NPU_ARCH__ == 3102) || \
(__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || \
(__NPU_ARCH__ == 3113) || (__NPU_ARCH__ == 3510))
CheckValueRange<int>(static_cast<int>(order), 0, 3, "order", apiName);
#elif (__NPU_ARCH__ == 1001) || (__NPU_ARCH__ == 2002)
CheckValueRange<int>(static_cast<int>(order), 0, 1, "order", apiName);
#endif
using DstPrimType = PrimT<T>;
CheckTensorAlignment(dst, 2 * sizeof(DstPrimType), "dst", apiName);
}
template <typename T, typename U>
__aicore__ inline void CheckReduceRepeatCommonParams(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const int32_t repeatTime, const __gm__ char* apiName)
{
#if __NPU_ARCH__ == 3510
CheckValueRange<int32_t>(repeatTime, 0, 255, "repeatTime", apiName);
ReportNopWarning<int32_t>(repeatTime, "repeatTime", apiName);
CheckTensorPhyPosition<Hardware::UB>(dst, "dst", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorPhyPosition<Hardware::UB>(src, "src", "VECIN / VECCALC / VECOUT", apiName);
CheckTensorAlignment(src, ONE_BLK_SIZE, "src", apiName);
#else
CheckReduceBaseParams<T>(dst, src, repeatTime, apiName);
#endif
using DstPrimType = PrimT<T>;
CheckTensorAlignment(dst, sizeof(DstPrimType), "dst", apiName);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask, typename MaskType>
__aicore__ inline void ReduceDataBlockCommon(const LocalTensor<T>& dst, const LocalTensor<U>& src, const MaskType mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
using DstPrimType = PrimT<T>;
using SrcPrimType = PrimT<U>;
static_assert((SupportEnum<reduceType, ReduceType::SUM, ReduceType::MAX, ReduceType::MIN>()),
"Invalid reduceType for ReduceDataBlock, only ReduceType::SUM, ReduceType::MAX and ReduceType::MIN are supported.");
static_assert(Std::is_same_v<SrcPrimType, DstPrimType>,
"ReduceDataBlock only supports identical src type and dst type for current NPU_ARCH.");
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
CheckReduceDataBlockParams<T, U>(dst, src, repeatTime, "ReduceDataBlock");
if constexpr (Std::is_same_v<MaskType, int32_t>) {
CheckMaskValue<SrcPrimType, isSetMask>(mask, "ReduceDataBlock");
} else {
CheckMaskArray<SrcPrimType, isSetMask>(mask, "ReduceDataBlock");
}
#endif
#if ASCENDC_CPU_DEBUG
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride, "BlockReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceDataBlock",
(Std::is_same_v<MaskType, int32_t> ? KernelFuncType::MASK_COUNT_MODE : KernelFuncType::MASK_BIT_MODE));
}
#endif
if constexpr (reduceType == ReduceType::SUM) {
BlockReduceSumImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(), repeatTime,
mask, dstRepStride, srcBlkStride, srcRepStride);
} else if constexpr (reduceType == ReduceType::MAX) {
BlockReduceMaxImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(), repeatTime,
mask, dstRepStride, srcBlkStride, srcRepStride);
} else {
BlockReduceMinImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(), repeatTime,
mask, dstRepStride, srcBlkStride, srcRepStride);
}
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask, typename MaskType>
__aicore__ inline void ReducePairElemCommon(const LocalTensor<T>& dst, const LocalTensor<U>& src, const MaskType mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
using DstPrimType = PrimT<T>;
using SrcPrimType = PrimT<U>;
static_assert((SupportEnum<reduceType, ReduceType::SUM>()),
"Invalid reduceType for ReducePairElem, only ReduceType::SUM is supported.");
static_assert(Std::is_same_v<SrcPrimType, DstPrimType>,
"ReducePairElem only supports identical src type and dst type for current NPU_ARCH.");
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
ASCENDC_DEBUG_ASSERT((SupportType<SrcPrimType, half, float>()), KERNEL_LOG_INTERNAL(KERNEL_ERROR,
"Failed to check dtype in ReducePairElem, current api support dtype combination is "
"src and dst both: half / float.\n"));
CheckValueRange<int32_t>(repeatTime, 0, 255, "repeatTime", "ReducePairElem");
ReportNopWarning<int32_t>(repeatTime, "repeatTime", "ReducePairElem");
CheckVectorTensor("ReducePairElem", NamedTensor(dst, "dst"), NamedTensor(src, "src"));
if constexpr (Std::is_same_v<MaskType, int32_t>) {
CheckMaskValue<SrcPrimType, isSetMask>(mask, "ReducePairElem");
} else {
CheckMaskArray<SrcPrimType, isSetMask>(mask, "ReducePairElem");
}
#endif
#if ASCENDC_CPU_DEBUG
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride, "PairReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReducePairElem",
(Std::is_same_v<MaskType, int32_t> ? KernelFuncType::MASK_COUNT_MODE : KernelFuncType::MASK_BIT_MODE));
}
#endif
PairReduceSumImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(), repeatTime,
mask, dstRepStride, srcBlkStride, srcRepStride);
}
template <typename T, typename U, bool isSetMask, typename MaskType>
__aicore__ inline void CheckReduceRepeatSumParams(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const MaskType mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
CheckReduceRepeatCommonParams<T, U>(dst, src, repeatTime, "ReduceRepeat");
#endif
#if ASCENDC_CPU_DEBUG
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride, "WholeReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceRepeat",
(Std::is_same_v<MaskType, int32_t> ? KernelFuncType::MASK_COUNT_MODE : KernelFuncType::MASK_BIT_MODE));
}
#endif
}
template <typename T, typename U, bool isSetMask, typename MaskType>
__aicore__ inline void CheckReduceRepeatMaxMinParams(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const MaskType mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order)
{
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
CheckReduceRepeatCommonParams<T, U>(dst, src, repeatTime, order, "ReduceRepeat");
#endif
#if ASCENDC_CPU_DEBUG && (__NPU_ARCH__ == 2002 || __NPU_ARCH__ == 2201)
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOtherWhl(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride,
order, "WholeReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceRepeat",
(Std::is_same_v<MaskType, int32_t> ? KernelFuncType::MASK_COUNT_MODE : KernelFuncType::MASK_BIT_MODE));
}
#endif
#if ASCENDC_CPU_DEBUG && (__NPU_ARCH__ == 3002 || __NPU_ARCH__ == 3003 || __NPU_ARCH__ == 3113)
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride,
"WholeReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceRepeat",
(Std::is_same_v<MaskType, int32_t> ? KernelFuncType::MASK_COUNT_MODE : KernelFuncType::MASK_BIT_MODE));
}
#endif
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask, typename MaskType>
__aicore__ inline void ReduceRepeatCommon(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const MaskType mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order)
{
using DstPrimType = PrimT<T>;
using SrcPrimType = PrimT<U>;
static_assert((SupportEnum<reduceType, ReduceType::SUM, ReduceType::MAX, ReduceType::MIN>()),
"Invalid reduceType for ReduceRepeat, only ReduceType::SUM, ReduceType::MAX and ReduceType::MIN are supported.");
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
static_assert((reduceType == ReduceType::SUM || Std::is_same_v<SrcPrimType, DstPrimType>),
"ReduceRepeat for MAX/MIN only supports identical dst type (T) and src type (U) for current NPU_ARCH.");
#else
static_assert(Std::is_same_v<SrcPrimType, DstPrimType>,
"ReduceRepeat only supports identical dst type (T) and src type (U) for current NPU_ARCH.");
#endif
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
if constexpr (Std::is_same_v<MaskType, int32_t>) {
CheckMaskValue<SrcPrimType, isSetMask>(mask, "ReduceRepeat");
} else {
CheckMaskArray<SrcPrimType, isSetMask>(mask, "ReduceRepeat");
}
#endif
if constexpr (reduceType == ReduceType::SUM) {
CheckReduceRepeatSumParams<T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
WholeReduceSumImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(),
mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
} else {
CheckReduceRepeatMaxMinParams<T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
if constexpr (reduceType == ReduceType::MAX) {
WholeReduceMaxImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(),
mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
} else {
WholeReduceMinImpl<SrcPrimType, isSetMask>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(),
mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
}
}
}
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReduceDataBlock(const LocalTensor<T>& dst, const LocalTensor<U>& src, const int32_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceBlkInfo(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReduceDataBlock");
#endif
Internal::ReduceDataBlockCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReduceDataBlock(const LocalTensor<T>& dst, const LocalTensor<U>& src, const uint64_t mask[],
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceBlkInfo(dst, src, mask[0], mask[1], repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReduceDataBlock");
#endif
Internal::ReduceDataBlockCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReducePairElem(const LocalTensor<T>& dst, const LocalTensor<U>& src, const int32_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReducePairInfo(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReducePairElem");
#endif
Internal::ReducePairElemCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReducePairElem(const LocalTensor<T>& dst, const LocalTensor<U>& src, const uint64_t mask[],
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReducePairInfo(dst, src, mask[0], mask[1], repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReducePairElem");
#endif
Internal::ReducePairElemCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReduceRepeat(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const int32_t mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceInfo(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReduceRepeat");
#endif
Internal::ReduceRepeatCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime,
dstRepStride, srcBlkStride, srcRepStride, order);
}
template <ReduceType reduceType, typename T, typename U, bool isSetMask>
__aicore__ inline void ReduceRepeat(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const uint64_t mask[], const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceInfo(dst, src, mask[0], mask[1], repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "ReduceRepeat");
#endif
Internal::ReduceRepeatCommon<reduceType, T, U, isSetMask>(dst, src, mask, repeatTime,
dstRepStride, srcBlkStride, srcRepStride, order);
}
* BlockReduceSum has been updated, please use ReduceDataBlock instead.
* @ingroup BlockReduceSum
* @brief Sum all elements in each block
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
* BlockReduceMax has been updated, please use ReduceDataBlock instead.
* @ingroup BlockReduceMax
* @brief Maximize all elements in each block
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::MAX, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
* BlockReduceMin has been updated, please use ReduceDataBlock instead.
* @ingroup BlockReduceMin
* @brief Find the minimum value of all elements in each block
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::MIN, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
* PairReduceSum has been updated, please use ReducePairElem instead.
* @ingroup PairReduceSum
* @brief Sum of adjacent inner pair (parity) elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline void PairReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReducePairElem<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const uint64_t mask[], const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const uint64_t mask[], const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::MAX, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void BlockReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const uint64_t mask[], const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceDataBlock<ReduceType::MIN, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void PairReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const uint64_t mask[], const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReducePairElem<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
template <typename T, bool isSetMask = true, typename U = T>
__aicore__ inline void RepeatReduceSum(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstBlkStride, const int32_t srcBlkStride,
const int32_t dstRepStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceRepeatInfo(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "RepeatReduceSum");
#endif
using DstPrimType = PrimT<U>;
using SrcPrimType = PrimT<T>;
ASCENDC_CHECK_VALUE_RANGE(repeatTime, 0, 255, "repeatTime", "RepeatReduceSum");
#if ASCENDC_CPU_DEBUG
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride,
"RepeatReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("RepeatReduceSum", KernelFuncType::MASK_COUNT_MODE);
}
#endif
RepeatReduceSumImpl<SrcPrimType, isSetMask, DstPrimType>((__ubuf__ DstPrimType*)dst.GetPhyAddr(), (__ubuf__ SrcPrimType*)src.GetPhyAddr(), repeatTime,
mask, dstBlkStride, srcBlkStride, dstRepStride, srcRepStride);
}
#else
template <typename T, bool isSetMask = true>
__aicore__ inline void RepeatReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t repeatTime, const int32_t mask, const int32_t dstBlkStride, const int32_t srcBlkStride,
const int32_t dstRepStride, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceRepeatInfo(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, isSetMask, "RepeatReduceSum");
#endif
using PrimType = PrimT<T>;
#if defined(ASCENDC_DEBUG) || defined(ASCENDC_CPU_DEBUG)
ASCENDC_DEBUG_ASSERT((SupportType<PrimType, half, float>()), KERNEL_LOG_INTERNAL(KERNEL_ERROR,
"Failed to check dtype in RepeatReduceSum, current api support dtype combination is "
"src and dst both: half / float.\n"));
CheckValueRange<int32_t>(repeatTime, 0, 255, "repeatTime", "RepeatReduceSum");
ReportNopWarning<int32_t>(repeatTime, "repeatTime", "RepeatReduceSum");
CheckTensorPhyPosition<Hardware::UB>(dst, "dst", "VECIN / VECCALC / VECOUT", "RepeatReduceSum");
CheckTensorPhyPosition<Hardware::UB>(src, "src", "VECIN / VECCALC / VECOUT", "RepeatReduceSum");
CheckTensorAlignment(src, ONE_BLK_SIZE, "src", "RepeatReduceSum");
if constexpr (sizeof(PrimType) == 2) {
CheckTensorAlignment(dst, 2, "dst", "RepeatReduceSum");
} else if constexpr (sizeof(PrimType) == 4) {
CheckTensorAlignment(dst, 4, "dst", "RepeatReduceSum");
}
CheckMaskValue<PrimType, isSetMask>(mask, "RepeatReduceSum");
#endif
#if ASCENDC_CPU_DEBUG
MaskSetter::Instance().SetMask(isSetMask);
if (!CheckFunVecReduceOther(dst, src, repeatTime, mask, dstRepStride, srcBlkStride, srcRepStride,
"RepeatReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("RepeatReduceSum", KernelFuncType::MASK_COUNT_MODE);
}
#endif
RepeatReduceSumImpl<PrimType, isSetMask>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(), repeatTime,
mask, dstBlkStride, srcBlkStride, dstRepStride, srcRepStride);
}
#endif
* WholeReduceSum has been updated, please use ReduceRepeat instead.
* @ingroup WholeReduceSum
* @brief Sum of all effective elements in each repeatTime
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
template <typename T, bool isSetMask = true, typename U = T>
__aicore__ inline void WholeReduceSum(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const uint64_t mask[], const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceRepeat<ReduceType::SUM, U, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
#else
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const uint64_t mask[], const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceRepeat<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
#endif
* WholeReduceMax has been updated, please use ReduceRepeat instead.
* @ingroup WholeReduceMax
* @brief Index of the maximum value of all elements in each repeat
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] dstRepStride dst repeat stride
* @param [in] srcBlkStride src block stride
* @param [in] srcRepStride src repeat stride
*/
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const uint64_t mask[], const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order = ReduceOrder::ORDER_VALUE_INDEX)
{
ReduceRepeat<ReduceType::MAX, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const uint64_t mask[], const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order = ReduceOrder::ORDER_VALUE_INDEX)
{
ReduceRepeat<ReduceType::MIN, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
}
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || __NPU_ARCH__ == 3003 || __NPU_ARCH__ == 3113)
template <typename T, bool isSetMask = true, typename U = T>
__aicore__ inline void WholeReduceSum(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const int32_t mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceRepeat<ReduceType::SUM, U, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
#else
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride)
{
ReduceRepeat<ReduceType::SUM, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride);
}
#endif
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order = ReduceOrder::ORDER_VALUE_INDEX)
{
ReduceRepeat<ReduceType::MAX, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
}
template <typename T, bool isSetMask = true>
__aicore__ inline void WholeReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const int32_t mask, const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride,
const int32_t srcRepStride, ReduceOrder order = ReduceOrder::ORDER_VALUE_INDEX)
{
ReduceRepeat<ReduceType::MIN, T, T, isSetMask>(dst, src, mask, repeatTime, dstRepStride, srcBlkStride, srcRepStride, order);
}
* @ingroup ReduceMax Level 0
* @brief Index of the maximum value of all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] srcRepStride src repeat stride
* @param [in] calIndex Specify whether to get the index with the highest value
*/
template <typename T>
__aicore__ inline void ReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t mask, const int32_t repeatTime, const int32_t srcRepStride,
bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask, repeatTime, srcRepStride, "ReduceMax");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, calIndex, srcRepStride, "ReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMax", KernelFuncType::MASK_COUNT_MODE);
}
#endif
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
ReduceMaxImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), {KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMax, "
"current api support dtype combination is src and dst both: half / float");});
if (mask == 0) {
return;
}
ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MAX);
#endif
}
* @ingroup ReduceMin
* @brief Index of the minimum value of all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] srcRepStride src repeat stride
* @param [in] calIndex Specify whether to get the index with the highest value
*/
template <typename T>
__aicore__ inline void ReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t mask, const int32_t repeatTime, const int32_t srcRepStride,
bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask, repeatTime, srcRepStride, "ReduceMin");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, calIndex, srcRepStride, "ReduceMin")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMin", KernelFuncType::MASK_COUNT_MODE);
}
#endif
#if (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
ReduceMinImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), {KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMin, "
"current api support dtype combination is src and dst both: half / float");});
if (mask == 0) {
return;
}
struct ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MIN);
#endif
}
* @ingroup ReduceSum
* @brief sum all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] repeatTime repeat times
* @param [in] mask[]/maskcount mask array/count
* @param [in] srcRepStride src repeat stride
*/
template <typename T>
__aicore__ inline void ReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t mask, const int32_t repeatTime, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask, repeatTime, srcRepStride, "ReduceSum");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, srcRepStride, "ReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceSum", KernelFuncType::MASK_COUNT_MODE);
}
#endif
#if (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
ReduceSumImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), {KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceSum, "
"current api support dtype combination is src and dst both: half / float");});
if (mask == 0) {
return;
}
ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, 0, ReduceMode::REDUCE_SUM);
#endif
}
template <typename T>
__aicore__ inline void ReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const uint64_t mask[], const int32_t repeatTime, const int32_t srcRepStride,
bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask[0], mask[1], repeatTime, srcRepStride, "ReduceMax");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, calIndex, srcRepStride, "ReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMax", KernelFuncType::MASK_BIT_MODE);
}
#endif
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
ReduceMaxImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMax, "
"current api support dtype combination is src and dst both: half / float");});
if (mask[0] == 0 && mask[1] == 0) {
return;
}
struct ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MAX);
#endif
}
template <typename T>
__aicore__ inline void ReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const uint64_t mask[], const int32_t repeatTime, const int32_t srcRepStride,
bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask[0], mask[1], repeatTime, srcRepStride, "ReduceMin");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, calIndex, srcRepStride, "ReduceMin")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMin", KernelFuncType::MASK_BIT_MODE);
}
#endif
#if (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
ReduceMinImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMin, "
"current api support dtype combination is src and dst both: half / float");});
if (mask[0] == 0 && mask[1] == 0) {
return;
}
struct ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MIN);
#endif
}
template <typename T>
__aicore__ inline void ReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const uint64_t mask[], const int32_t repeatTime, const int32_t srcRepStride)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, mask[0], mask[1], repeatTime, srcRepStride, "ReduceSum");
#endif
using PrimType = PrimT<T>;
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, mask, srcRepStride, "ReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceSum", KernelFuncType::MASK_BIT_MODE);
}
#endif
#if (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
ReduceSumImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), mask, repeatTime, srcRepStride);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceSum, "
"current api support dtype combination is src and dst both: half / float");});
if (mask[0] == 0 && mask[1] == 0) {
return;
}
struct ReduceRepeatParams params(mask, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE, srcRepStride);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, 0, ReduceMode::REDUCE_SUM);
#endif
}
* @ingroup ReduceMin Level 2
* @brief Index of the minimum value of all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] count Number of data involved in calculation
* @param [in] calIndex Specify whether to get the index with the highest value
*/
template <typename T>
__aicore__ inline void ReduceMin(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t count, bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, count, "ReduceMin");
#endif
using PrimType = PrimT<T>;
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
#if ASCENDC_CPU_DEBUG
int32_t oneRepSize = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeats = count < oneRepSize ? 1 : (count / oneRepSize);
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeats, count, calIndex, "vreduce_min")) {
ASSERT(false && "check vreduce min instr failed");
}
#endif
ReduceMinImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(),
(__ubuf__ PrimType*)src.GetPhyAddr(), (__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), count, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMin, "
"current api support dtype combination is src and dst both: half / float");});
ASCENDC_CHECK_VALUE_RANGE(count, 1, TOTAL_UB_SIZE / sizeof(PrimType), "count", "ReduceMin");
int32_t elementNumPerRep = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeatTime = count / elementNumPerRep;
int32_t tailCount = count % elementNumPerRep;
int32_t bodyCount = elementNumPerRep;
if (repeatTime == 0) {
repeatTime = 1;
bodyCount = count;
tailCount = 0;
}
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, count, calIndex, "ReduceMin")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMin", KernelFuncType::NONE_MODE);
}
#endif
if (count == 0) {
return;
}
struct ReduceRepeatParams params(bodyCount, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE,
DEFAULT_REPEAT_STRIDE);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MIN);
if (tailCount != 0) {
ReduceTailCompute(dst, src, sharedTmpBuffer, count, calIndex, ReduceMode::REDUCE_MIN);
}
#endif
}
* @ingroup ReduceMax Level 2
* @brief Index of the maximum value of all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] count Number of data involved in calculation
* @param [in] calIndex Specify whether to get the index with the highest value
*/
template <typename T>
__aicore__ inline void ReduceMax(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t count, bool calIndex)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, count, "ReduceMax");
#endif
using PrimType = PrimT<T>;
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
#if ASCENDC_CPU_DEBUG
int32_t oneRepSize = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeats = count < oneRepSize ? 1 : (count / oneRepSize);
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeats, count, calIndex, "vreduce_max")) {
ASSERT(false && "check vreduce max instr failed");
}
#endif
ReduceMaxImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(),
(__ubuf__ PrimType*)src.GetPhyAddr(), (__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), count, calIndex);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), {KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceMax, "
"current api support dtype combination is src and dst both: half / float");});
ASCENDC_CHECK_VALUE_RANGE(count, 1, TOTAL_UB_SIZE / sizeof(PrimType), "count", "ReduceMax");
int32_t elementNumPerRep = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeatTime = count / elementNumPerRep;
int32_t tailCount = count % elementNumPerRep;
int32_t bodyCount = elementNumPerRep;
if (repeatTime == 0) {
repeatTime = 1;
bodyCount = count;
tailCount = 0;
}
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, repeatTime, count, calIndex, "ReduceMax")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceMax", KernelFuncType::NONE_MODE);
}
#endif
if (count == 0) {
return;
}
struct ReduceRepeatParams params(bodyCount, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE,
DEFAULT_REPEAT_STRIDE);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, calIndex, ReduceMode::REDUCE_MAX);
if (tailCount != 0) {
ReduceTailCompute(dst, src, sharedTmpBuffer, count, calIndex, ReduceMode::REDUCE_MAX);
}
#endif
}
* @ingroup ReduceSum Level 2
* @brief sum all input elements
* @param [out] dst output LocalTensor
* @param [in] src input LocalTensor
* @param [in] sharedTmpBuffer LocalTensor to store the intermediate results
* @param [in] count Number of data involved in calculation
*/
template <typename T, bool isSetMask>
__aicore__ inline void ReduceSum(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t count)
{
#ifdef __MSTX_DFX_REPORT__
MstxTensor::GetMstxVecReduceComplexInfo(dst, src, sharedTmpBuffer, count, "ReduceSum");
#endif
using PrimType = PrimT<T>;
ASCENDC_CHECK_VALUE_RANGE(count, 1, TOTAL_UB_SIZE / sizeof(PrimType), "count", "ReduceSum");
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceSum, "
"current api support dtype combination is src and dst both: half / float");});
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduceMode2(dst, src, count, "ReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceSum", KernelFuncType::CALCOUNT_MODE);
}
#endif
if (count == 0) {
return;
}
ReduceSumImpl<PrimType, isSetMask>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(), count);
#elif (__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
#if ASCENDC_CPU_DEBUG
int32_t oneRepSize = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeats = count < oneRepSize ? 1 : (count / oneRepSize);
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, count, repeats, "vreduce_sum")) {
ASSERT(false && "check vreduce sum instr failed");
}
#endif
ReduceSumImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(), (__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), count);
#elif defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
ReduceSumImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), count);
#else
ASCENDC_ASSERT((SupportType<PrimType, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in ReduceSum, "
"current api support dtype combination is src and dst both: half / float");});
int32_t elementNumPerRep = ONE_REPEAT_BYTE_SIZE / sizeof(PrimType);
int32_t repeatTime = count / elementNumPerRep;
int32_t tailCount = count % elementNumPerRep;
int32_t bodyCount = elementNumPerRep;
if (repeatTime == 0) {
repeatTime = 1;
bodyCount = count;
tailCount = 0;
}
#if ASCENDC_CPU_DEBUG
if (!CheckFunVecReduce(dst, src, sharedTmpBuffer, count, repeatTime, "ReduceSum")) {
ASCENDC_REPORT_CHECK_ERROR("ReduceSum", KernelFuncType::CALCOUNT_MODE);
}
#endif
if (count == 0) {
return;
}
struct ReduceRepeatParams params(bodyCount, repeatTime, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE,
DEFAULT_REPEAT_STRIDE);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)src.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), params, 0, ReduceMode::REDUCE_SUM);
event_t eventIdVToS = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_S));
SetFlag<HardEvent::V_S>(eventIdVToS);
WaitFlag<HardEvent::V_S>(eventIdVToS);
PrimType bodySumValue = dst.GetValue(0);
if (tailCount != 0) {
struct ReduceRepeatParams tailParams(tailCount, 1, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE,
DEFAULT_REPEAT_STRIDE);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(),
(__ubuf__ PrimType*)src.GetPhyAddr(elementNumPerRep * repeatTime), (__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(),
tailParams, 0, ReduceMode::REDUCE_SUM);
event_t eventIdVToS1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_S));
SetFlag<HardEvent::V_S>(eventIdVToS1);
WaitFlag<HardEvent::V_S>(eventIdVToS1);
PrimType tailSumValue = dst.GetValue(0);
sharedTmpBuffer.SetValue(0, bodySumValue);
sharedTmpBuffer.SetValue(1, tailSumValue);
event_t eventIdSToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::S_V));
SetFlag<HardEvent::S_V>(eventIdSToV);
WaitFlag<HardEvent::S_V>(eventIdSToV);
struct ReduceRepeatParams newParams(2, 1, DEFAULT_REDUCE_DST_REP_STRIDE, DEFAULT_BLK_STRIDE,
DEFAULT_REPEAT_STRIDE);
ReduceImpl<PrimType>((__ubuf__ PrimType*)dst.GetPhyAddr(), (__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(),
(__ubuf__ PrimType*)sharedTmpBuffer.GetPhyAddr(), newParams, 0, ReduceMode::REDUCE_SUM);
}
#endif
}
#pragma end_pipe
template <typename T>
__aicore__ inline void GetReduceMaxMinCount(uint32_t &maxMinValue, uint32_t &maxMinIndex)
{
using PrimType = PrimT<T>;
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return;
}
#endif
GetReduceMaxMinCountImpl<PrimType>(maxMinValue, maxMinIndex);
}
template <typename T>
__aicore__ inline void GetReduceMaxMinCount(uint32_t &maxMinValue)
{
using PrimType = PrimT<T>;
GetReduceMaxMinCountImpl<PrimType>(maxMinValue);
}
template <typename T>
__aicore__ inline void GetReduceRepeatMaxMinSpr(uint32_t &maxMinValue, uint32_t &maxMinIndex)
{
using PrimType = PrimT<T>;
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return;
}
#endif
GetReduceMaxMinCountImpl<PrimType>(maxMinValue, maxMinIndex);
}
template <typename T>
__aicore__ inline void GetReduceRepeatMaxMinSpr(uint32_t &maxMinValue)
{
using PrimType = PrimT<T>;
GetReduceMaxMinCountImpl<PrimType>(maxMinValue);
}
__aicore__ inline int64_t GetReduceRepeatSumSpr()
{
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return 0;
}
#endif
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 3003 || __NPU_ARCH__ == 3113)
return GetAccValImpl<int64_t>();
#else
return get_acc_val();
#endif
}
__aicore__ inline int64_t GetAccVal()
{
return GetReduceRepeatSumSpr();
}
template <typename T>
__aicore__ inline __inout_pipe__(S) void GetReduceMaxMinCount(T &maxMinValue, T &maxMinIndex)
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in "
"GetReduceMaxMinCount, current api support dtype combination is maxMinValue and maxMinIndex both: half / "
"float");});
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return;
}
#endif
GetReduceMaxMinCountImpl<T>(maxMinValue, maxMinIndex);
}
template <typename T>
__aicore__ inline __inout_pipe__(S) void GetReduceMaxMinCount(T &maxMinValue)
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in "
"GetReduceMaxMinCount, current api support dtype combination is maxMinValue: half / float");});
GetReduceMaxMinCountImpl<T>(maxMinValue);
}
template <typename T>
__aicore__ inline __inout_pipe__(S) void GetReduceRepeatMaxMinSpr(T &maxMinValue, T &maxMinIndex)
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in "
"GetReduceRepeatMaxMinSpr, current api support dtype combination is maxMinValue and maxMinIndex both: half / "
"float");});
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return;
}
#endif
GetReduceMaxMinCountImpl<T>(maxMinValue, maxMinIndex);
}
template <typename T>
__aicore__ inline __inout_pipe__(S) void GetReduceRepeatMaxMinSpr(T &maxMinValue)
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in "
"GetReduceRepeatMaxMinSpr, current api support dtype combination is maxMinValue: half / float");});
GetReduceMaxMinCountImpl<T>(maxMinValue);
}
template <typename T>
__aicore__ inline __inout_pipe__(S) T GetReduceRepeatSumSpr()
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in GetReduceRepeatSumSpr, "
"current api support dtype combination is half / float");});
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 2201)
if (g_coreType == AIC) {
return 0;
}
#endif
return GetAccValImpl<T>();
}
template <typename T>
__aicore__ inline __inout_pipe__(S) T GetAccVal()
{
ASCENDC_ASSERT((SupportType<T, half, float>()), { KERNEL_LOG(KERNEL_ERROR, "Failed to check dtype in GetAccVal, "
"current api support dtype combination is half / float");});
return GetReduceRepeatSumSpr<T>();
}
}
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_VEC_REDUCE_INTF_IMPL_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_OPERATOR_VEC_REDUCE_INTF_IMPL_H__
#endif