/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
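/*!
* \file kernel_reg.h
* \brief Internal register-level helpers: vector mask, pipe barriers, dcci, CTRL mode bits and AIPP configuration.
*/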
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message("impl/basic_api/kernel_reg.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"basic_api/kernel_common.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_REG_H__
#endif
#ifndef ASCENDC_KERNEL_REG_IMPL_H
#define ASCENDC_KERNEL_REG_IMPL_H
#include "kernel_utils.h"
#include "kernel_struct_aipp.h"
namespace AscendC {
// Zero-valued placeholder mask word, and the two-word list form built from it.
constexpr uint64_t MASK_PLACEHOLDER = 0;
constexpr uint64_t MASK_PLACEHOLDER_LIST[2] = {MASK_PLACEHOLDER, MASK_PLACEHOLDER};
// Selects how a vector mask value is interpreted; stored in a single byte.
enum class MaskMode : uint8_t {
    NORMAL = 0,
    COUNTER = 1
};
// Writes the vector mask from two explicit 64-bit words.
// maskHigh / maskLow are forwarded unchanged to set_vector_mask.
// The template parameters T and mode are not used by this overload's body.
template <typename T, MaskMode mode>
__aicore__ static inline void SetVectorMaskImpl(const uint64_t maskHigh, const uint64_t maskLow)
{
// The write is performed only when ASCEND_IS_NOT_AIC holds
// (presumably: not compiled/running for the AIC core type - confirm against the macro definition).
if ASCEND_IS_NOT_AIC {
set_vector_mask(maskHigh, maskLow);
}
}
// Sets the vector mask from a length value.
// In COUNTER mode the length is written as the low mask word with a zero high
// word; otherwise the work is delegated to AscendCUtils::SetMask for PrimT<T>.
template <typename T, MaskMode mode>
__aicore__ static inline void SetVectorMaskImpl(int32_t len)
{
    if constexpr (mode == MaskMode::COUNTER) {
        constexpr uint64_t highWord = 0;
        SetVectorMaskImpl<PrimT<T>, mode>(highWord, static_cast<uint64_t>(len));
        return;
    }
    AscendCUtils::SetMask<PrimT<T>>(len);
}
// Resets the vector mask by writing FULL_MASK to both mask words.
__aicore__ inline void ResetMaskImpl()
{
// Performed only when ASCEND_IS_NOT_AIC holds
// (presumably: not compiled/running for the AIC core type - confirm against the macro definition).
if ASCEND_IS_NOT_AIC {
set_vector_mask(FULL_MASK, FULL_MASK);
}
}
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
// PipeBarrier implementation for __NPU_ARCH__ 3510 and 5102.
// Template parameter pipe: the pipeline the barrier is requested for.
// Some pipes are deliberately turned into no-ops, see the notes below.
template <pipe_t pipe> __aicore__ inline void PipeBarrierImpl()
{
#if (__NPU_ARCH__ == 5102)
// 5102: a PIPE_MTE3 barrier is skipped entirely.
if constexpr (pipe == PIPE_MTE3) {
return;
}
#else
// 3510: PIPE_S is rejected at compile time.
static_assert(pipe != PIPE_S, "PipeBarrier<PIPE_S> is not supported on current device!");
// 3510: a PIPE_MTE3 barrier is skipped only when ASCEND_IS_AIC holds.
if ASCEND_IS_AIC {
if constexpr (pipe == PIPE_MTE3) {
return;
}
}
#endif
// PIPE_V never reaches pipe_barrier on these architectures; any other pipe
// that was not filtered out above does.
if constexpr (pipe != PIPE_V) {
pipe_barrier(pipe);
}
return;
}
#else
// PipeBarrier implementation for every architecture other than 3510 / 5102.
// Template parameter pipe: the pipeline the barrier is requested for.
// Each architecture section below filters out the pipes for which the barrier
// is a no-op; whatever is left falls through to pipe_barrier(pipe).
template <pipe_t pipe> __aicore__ inline void PipeBarrierImpl()
{
#if __NPU_ARCH__ == 3102
// 3102: every barrier is a no-op (the code below is unreachable on this arch).
return;
#endif
#if (__NPU_ARCH__ == 3002)
// 3002: PIPE_S and PIPE_V barriers are skipped.
if constexpr (pipe == PIPE_S || pipe == PIPE_V) {
return;
}
#endif
#if (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
// 3003 / 3113: PIPE_V barriers are skipped.
if constexpr (pipe == PIPE_V) {
return;
}
#endif
#if (__NPU_ARCH__ == 2201)
// 2201: PIPE_S is reported through the debug assertion macro (not a static_assert),
// and PIPE_V is skipped when ASCEND_IS_AIC holds.
ASCENDC_DEBUG_ASSERT(pipe != PIPE_S, KERNEL_LOG_INTERNAL(KERNEL_ERROR, "PipeBarrier<PIPE_S> is not supported on current device!"));
if ASCEND_IS_AIC {
if constexpr (pipe == PIPE_V) {
return;
}
}
#endif
pipe_barrier(pipe);
}
#endif
// Scope operand passed (as uint64_t) to dcci by the Dcci*Impl helpers.
enum class CacheLine : uint64_t {
    SINGLE_CACHE_LINE = 0,
    ENTIRE_DATA_CACHE = 1
};
// Destination selector passed (as uint64_t) to the three-operand dcci form.
enum class DcciDst : uint64_t {
    CACHELINE_ALL = 0,
    CACHELINE_UB = 1,
    CACHELINE_OUT = 2,
    CACHELINE_ATOMIC = 3
};
#if defined(__NPU_ARCH__) && \
((__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3002) || \
(__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
// Issues dcci on a global-memory address with an explicit destination selector.
// dst: address handed to dcci as a __gm__ void pointer.
// entireType / dcciDst: compile-time operands, passed as their uint64_t values.
template <typename T, CacheLine entireType, DcciDst dcciDst>
__aicore__ inline void DcciGMImpl(__gm__ T* dst)
{
    constexpr uint64_t lineScope = static_cast<uint64_t>(entireType);
    constexpr uint64_t dstSelect = static_cast<uint64_t>(dcciDst);
    dcci(static_cast<__gm__ void *>(dst), lineScope, dstSelect);
}
// Issues dcci on a unified-buffer (__ubuf__) address with an explicit destination selector.
// dst: address handed to dcci as a __ubuf__ void pointer.
// entireType / dcciDst: compile-time operands, passed as their uint64_t values.
template <typename T, CacheLine entireType, DcciDst dcciDst>
__aicore__ inline void DcciUBImpl(__ubuf__ T* dst)
{
    constexpr uint64_t lineScope = static_cast<uint64_t>(entireType);
    constexpr uint64_t dstSelect = static_cast<uint64_t>(dcciDst);
    dcci(static_cast<__ubuf__ void *>(dst), lineScope, dstSelect);
}
#endif
#if defined(__NPU_ARCH__ ) && \
((__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 2002) || (__NPU_ARCH__ == 3002) || \
(__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113))
// Issues the two-operand dcci form on a global-memory address.
// dst: address handed to dcci as a __gm__ void pointer.
// entireType: compile-time operand, passed as its uint64_t value.
template <typename T, CacheLine entireType>
__aicore__ inline void DcciGMImpl(__gm__ T* dst)
{
    constexpr uint64_t lineScope = static_cast<uint64_t>(entireType);
    dcci(static_cast<__gm__ void *>(dst), lineScope);
}
#endif
// Switches the mask to count mode.
// On arch 3113 this is done by applying sbitset1 to bit 56 of the CTRL value;
// every other arch calls set_mask_count().
__aicore__ inline void SetMaskCountImpl()
{
#if defined (__NPU_ARCH__) && (__NPU_ARCH__ == 3113)
    constexpr uint32_t counterBit = 56;
    const auto ctrlWord = get_ctrl();
    set_ctrl(sbitset1(ctrlWord, counterBit));
#else
    set_mask_count();
#endif
}
// Switches the mask to normal mode.
// On arch 3113 this is done by applying sbitset0 to bit 56 of the CTRL value;
// every other arch calls set_mask_norm().
__aicore__ inline void SetMaskNormImpl()
{
#if defined (__NPU_ARCH__) && (__NPU_ARCH__ == 3113)
    constexpr uint32_t counterBit = 56;
    const auto ctrlWord = get_ctrl();
    set_ctrl(sbitset0(ctrlWord, counterBit));
#else
    set_mask_norm();
#endif
}
// Updates LEAKY_RELU_MODE_BIT in the CTRL value (read, modify, write back).
// lreluMode: true applies sbitset1 to the bit, false applies sbitset0.
__aicore__ inline void SetLreluMode(bool lreluMode)
{
    const auto ctrlWord = get_ctrl();
    if (lreluMode) {
        set_ctrl(sbitset1(ctrlWord, LEAKY_RELU_MODE_BIT));
        return;
    }
    set_ctrl(sbitset0(ctrlWord, LEAKY_RELU_MODE_BIT));
}
// Updates HF32_MODE_BIT in the CTRL value (read, modify, write back).
// hf32Mode: true applies sbitset1 to the bit, false applies sbitset0.
__aicore__ inline void SetHF32ModeImpl(bool hf32Mode)
{
    const auto ctrlWord = get_ctrl();
    if (hf32Mode) {
        set_ctrl(sbitset1(ctrlWord, HF32_MODE_BIT));
        return;
    }
    set_ctrl(sbitset0(ctrlWord, HF32_MODE_BIT));
}
// Updates HF32_TRANS_MODE_BIT in the CTRL value (read, modify, write back).
// hf32TransMode: true applies sbitset1 to the bit, false applies sbitset0.
__aicore__ inline void SetHF32TransModeImpl(bool hf32TransMode)
{
    const auto ctrlWord = get_ctrl();
    if (hf32TransMode) {
        set_ctrl(sbitset1(ctrlWord, HF32_TRANS_MODE_BIT));
        return;
    }
    set_ctrl(sbitset0(ctrlWord, HF32_TRANS_MODE_BIT));
}
// Updates MM_LAYOUT_MODE_BIT in the CTRL value (read, modify, write back).
// mmLayoutMode: true applies sbitset1 to the bit, false applies sbitset0.
__aicore__ inline void SetMMLayoutTransformImpl(bool mmLayoutMode)
{
    const auto ctrlWord = get_ctrl();
    if (mmLayoutMode) {
        set_ctrl(sbitset1(ctrlWord, MM_LAYOUT_MODE_BIT));
        return;
    }
    set_ctrl(sbitset0(ctrlWord, MM_LAYOUT_MODE_BIT));
}
// Updates CAST_MODE_BIT in the CTRL value; the direction is chosen at compile time.
// castMode: true applies sbitset1 to the bit, false applies sbitset0.
template <bool castMode>
__aicore__ inline void SetCastOverflowModeImpl()
{
    const auto ctrlWord = get_ctrl();
    if constexpr (castMode) {
        set_ctrl(sbitset1(ctrlWord, CAST_MODE_BIT));
    } else {
        set_ctrl(sbitset0(ctrlWord, CAST_MODE_BIT));
    }
}
#if defined(__NPU_ARCH__) && \
((__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 2002) || (__NPU_ARCH__ == 3002) || \
(__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
// Publishes the src0 address for AIPP.
// Only the low 48 bits of the pointer value are kept. On arch 3510 / 5102 the
// value is stored in Internal::g_aippSrc0; elsewhere it is written with
// set_aipp_spr_0.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl0(__gm__ T* src0)
{
    const uint64_t src0Bits = reinterpret_cast<uint64_t>(src0) & 0xffffffffffff;
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippSrc0 = src0Bits;
#else
    set_aipp_spr_0(src0Bits);
#endif
}
// Publishes the src1 address together with the CSC enable flag.
// The low 48 bits of the pointer value are kept, and the bit at
// AIPP_OFFSET_CSC_ENABLE is set when config.cscParams.isEnableCsc is true.
// On arch 3510 / 5102 the result goes to Internal::g_aippSrc1, elsewhere to
// set_aipp_spr_1.
template <typename T, typename U>
__aicore__ inline void SetAippFunctionsImpl1(__gm__ T* src1, AippParams<U>& config)
{
    uint64_t src1Bits = reinterpret_cast<uint64_t>(src1) & 0xffffffffffff;
    if (config.cscParams.isEnableCsc) {
        src1Bits |= static_cast<uint64_t>(1) << AIPP_OFFSET_CSC_ENABLE;
    }
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippSrc1 = src1Bits;
#else
    set_aipp_spr_1(src1Bits);
#endif
}
// Packs CSC matrix entries R0C0, R0C1, R0C2 and R1C0 into one 64-bit word.
// Each entry is taken as a 16-bit pattern; entries 1..3 are shifted by
// AIPP_OFFSET_CH1 / CH2 / CH3. On arch 3510 / 5102 the word is stored in
// Internal::g_aippCscRc0, elsewhere it is written with set_aipp_spr_2.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl2(AippParams<T>& config)
{
    const uint16_t r0c0 = GetScalarBitcodeValue(config.cscParams.cscMatrixR0C0);
    const uint16_t r0c1 = GetScalarBitcodeValue(config.cscParams.cscMatrixR0C1);
    const uint16_t r0c2 = GetScalarBitcodeValue(config.cscParams.cscMatrixR0C2);
    const uint16_t r1c0 = GetScalarBitcodeValue(config.cscParams.cscMatrixR1C0);
    const uint64_t packed = static_cast<uint64_t>(r0c0) |
                            (static_cast<uint64_t>(r0c1) << AIPP_OFFSET_CH1) |
                            (static_cast<uint64_t>(r0c2) << AIPP_OFFSET_CH2) |
                            (static_cast<uint64_t>(r1c0) << AIPP_OFFSET_CH3);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippCscRc0 = packed;
#else
    set_aipp_spr_2(packed);
#endif
}
// Packs CSC matrix entries R1C1, R1C2, R2C0 and R2C1 into one 64-bit word.
// Each entry is taken as a 16-bit pattern; entries 1..3 are shifted by
// AIPP_OFFSET_CH1 / CH2 / CH3. On arch 3510 / 5102 the word is stored in
// Internal::g_aippCscRc1, elsewhere it is written with set_aipp_spr_3.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl3(AippParams<T>& config)
{
    const uint16_t r1c1 = GetScalarBitcodeValue(config.cscParams.cscMatrixR1C1);
    const uint16_t r1c2 = GetScalarBitcodeValue(config.cscParams.cscMatrixR1C2);
    const uint16_t r2c0 = GetScalarBitcodeValue(config.cscParams.cscMatrixR2C0);
    const uint16_t r2c1 = GetScalarBitcodeValue(config.cscParams.cscMatrixR2C1);
    const uint64_t packed = static_cast<uint64_t>(r1c1) |
                            (static_cast<uint64_t>(r1c2) << AIPP_OFFSET_CH1) |
                            (static_cast<uint64_t>(r2c0) << AIPP_OFFSET_CH2) |
                            (static_cast<uint64_t>(r2c1) << AIPP_OFFSET_CH3);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippCscRc1 = packed;
#else
    set_aipp_spr_3(packed);
#endif
}
// Packs CSC matrix entry R2C2 (16-bit pattern) with the three output and three
// input CSC biases (8-bit patterns each) into one 64-bit word, using the
// AIPP_OFFSET_CSC_OUT_CH* / AIPP_OFFSET_CSC_IN_CH* shifts.
// On arch 3510 / 5102 the word is stored in Internal::g_aippCscBias, elsewhere
// it is written with set_aipp_spr_4.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl4(AippParams<T>& config)
{
    const uint16_t r2c2 = GetScalarBitcodeValue(config.cscParams.cscMatrixR2C2);
    const uint8_t biasOut0 = GetScalarBitcodeValue(config.cscParams.cscBiasOut0);
    const uint8_t biasOut1 = GetScalarBitcodeValue(config.cscParams.cscBiasOut1);
    const uint8_t biasOut2 = GetScalarBitcodeValue(config.cscParams.cscBiasOut2);
    const uint8_t biasIn0 = GetScalarBitcodeValue(config.cscParams.cscBiasIn0);
    const uint8_t biasIn1 = GetScalarBitcodeValue(config.cscParams.cscBiasIn1);
    const uint8_t biasIn2 = GetScalarBitcodeValue(config.cscParams.cscBiasIn2);
    const uint64_t packed = static_cast<uint64_t>(r2c2) |
                            (static_cast<uint64_t>(biasOut0) << AIPP_OFFSET_CSC_OUT_CH0) |
                            (static_cast<uint64_t>(biasOut1) << AIPP_OFFSET_CSC_OUT_CH1) |
                            (static_cast<uint64_t>(biasOut2) << AIPP_OFFSET_CSC_OUT_CH2) |
                            (static_cast<uint64_t>(biasIn0) << AIPP_OFFSET_CSC_IN_CH0) |
                            (static_cast<uint64_t>(biasIn1) << AIPP_OFFSET_CSC_IN_CH1) |
                            (static_cast<uint64_t>(biasIn2) << AIPP_OFFSET_CSC_IN_CH2);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippCscBias = packed;
#else
    set_aipp_spr_4(packed);
#endif
}
// Packs the three DTC mean values (8-bit patterns) into one 64-bit word.
// Channels 1 and 2 are shifted by AIPP_OFFSET_CH1 / CH2. The function returns
// immediately on arch 3002. On arch 3510 / 5102 the word is stored in
// Internal::g_aippDtcMean, elsewhere it is written with set_aipp_spr_5.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl5(AippParams<T>& config)
{
#if __NPU_ARCH__ == 3002
    return;
#endif
    const uint8_t meanCh0 = GetScalarBitcodeValue(config.dtcParams.dtcMeanCh0);
    const uint8_t meanCh1 = GetScalarBitcodeValue(config.dtcParams.dtcMeanCh1);
    const uint8_t meanCh2 = GetScalarBitcodeValue(config.dtcParams.dtcMeanCh2);
    const uint64_t packed = static_cast<uint64_t>(meanCh0) |
                            (static_cast<uint64_t>(meanCh1) << AIPP_OFFSET_CH1) |
                            (static_cast<uint64_t>(meanCh2) << AIPP_OFFSET_CH2);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippDtcMean = packed;
#else
    set_aipp_spr_5(packed);
#endif
}
// Packs the three DTC min values (16-bit patterns) into one 64-bit word.
// Channels 1 and 2 are shifted by AIPP_OFFSET_CH1 / CH2. The function returns
// immediately on arch 3002. On arch 3510 / 5102 the word is stored in
// Internal::g_aippDtcMin, elsewhere it is written with set_aipp_spr_6.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl6(AippParams<T>& config)
{
#if __NPU_ARCH__ == 3002
    return;
#endif
    const uint16_t minCh0 = GetScalarBitcodeValue(config.dtcParams.dtcMinCh0);
    const uint16_t minCh1 = GetScalarBitcodeValue(config.dtcParams.dtcMinCh1);
    const uint16_t minCh2 = GetScalarBitcodeValue(config.dtcParams.dtcMinCh2);
    const uint64_t packed = static_cast<uint64_t>(minCh0) |
                            (static_cast<uint64_t>(minCh1) << AIPP_OFFSET_CH1) |
                            (static_cast<uint64_t>(minCh2) << AIPP_OFFSET_CH2);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippDtcMin = packed;
#else
    set_aipp_spr_6(packed);
#endif
}
// Packs the three DTC var values (16-bit patterns) into one 64-bit word.
// Channels 1 and 2 are shifted by AIPP_OFFSET_CH1 / CH2. The function returns
// immediately on arch 3002. On arch 3510 / 5102 the word is stored in
// Internal::g_aippDtcVar, elsewhere it is written with set_aipp_spr_7.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl7(AippParams<T>& config)
{
#if __NPU_ARCH__ == 3002
    return;
#endif
    const uint16_t varCh0 = GetScalarBitcodeValue(config.dtcParams.dtcVarCh0);
    const uint16_t varCh1 = GetScalarBitcodeValue(config.dtcParams.dtcVarCh1);
    const uint16_t varCh2 = GetScalarBitcodeValue(config.dtcParams.dtcVarCh2);
    const uint64_t packed = static_cast<uint64_t>(varCh0) |
                            (static_cast<uint64_t>(varCh1) << AIPP_OFFSET_CH1) |
                            (static_cast<uint64_t>(varCh2) << AIPP_OFFSET_CH2);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippDtcVar = packed;
#else
    set_aipp_spr_7(packed);
#endif
}
// Packs the four per-channel padding values into one 64-bit word.
// For T = int8_t / uint8_t each value is taken as an 8-bit pattern, otherwise
// as a 16-bit pattern; channels 1..3 are shifted by AIPP_OFFSET_CH1 / CH2 / CH3.
// On arch 3510 / 5102 the word is stored in Internal::g_aippPaddingVal,
// elsewhere it is written with set_aipp_spr_8.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl8(AippParams<T>& config)
{
    uint64_t packed = 0;
    if constexpr(IsSameType<T, int8_t>::value || IsSameType<T, uint8_t>::value) {
        const uint8_t padCh0 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh0);
        const uint8_t padCh1 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh1);
        const uint8_t padCh2 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh2);
        const uint8_t padCh3 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh3);
        packed = static_cast<uint64_t>(padCh0) |
                 (static_cast<uint64_t>(padCh1) << AIPP_OFFSET_CH1) |
                 (static_cast<uint64_t>(padCh2) << AIPP_OFFSET_CH2) |
                 (static_cast<uint64_t>(padCh3) << AIPP_OFFSET_CH3);
    } else {
        const uint16_t padCh0 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh0);
        const uint16_t padCh1 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh1);
        const uint16_t padCh2 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh2);
        const uint16_t padCh3 = GetScalarBitcodeValue(config.paddingParams.paddingValueCh3);
        packed = static_cast<uint64_t>(padCh0) |
                 (static_cast<uint64_t>(padCh1) << AIPP_OFFSET_CH1) |
                 (static_cast<uint64_t>(padCh2) << AIPP_OFFSET_CH2) |
                 (static_cast<uint64_t>(padCh3) << AIPP_OFFSET_CH3);
    }
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    Internal::g_aippPaddingVal = packed;
#else
    set_aipp_spr_8(packed);
#endif
}
// Builds the AIPP control word and writes it with set_aipp_spr_9.
// The word holds, in this order of assembly: the channel-padding value (8-bit
// pattern for int8_t / uint8_t, 16-bit otherwise), the RB / UV / AX swap flags,
// the input format (low 5 bits), the single-line-copy flag, the padding mode
// (low 2 bits), on arch 3002 the DTC round mode (low bit), and the
// channel-padding mode (low bit).
template <typename T>
__aicore__ inline void SetAippFunctionsImpl9(AippInputFormat format, AippParams<T>& config)
{
    uint64_t ctrlWord = 0;

    // Channel-padding value occupies the lowest bits.
    if constexpr(IsSameType<T, int8_t>::value || IsSameType<T, uint8_t>::value) {
        const uint8_t cPadBits = GetScalarBitcodeValue(config.cPaddingParams.cPaddingValue);
        ctrlWord |= static_cast<uint64_t>(cPadBits);
    } else {
        const uint16_t cPadBits = GetScalarBitcodeValue(config.cPaddingParams.cPaddingValue);
        ctrlWord |= static_cast<uint64_t>(cPadBits);
    }

    // Single-bit swap flags.
    if (config.swapParams.isSwapRB) {
        ctrlWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_RB;
    }
    if (config.swapParams.isSwapUV) {
        ctrlWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_UV;
    }
    if (config.swapParams.isSwapAX) {
        ctrlWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_AX;
    }

    ctrlWord |= (static_cast<uint64_t>(format) & 0x1f) << AIPP_OFFSET_FORMAT;

    if (config.singleLineParams.isSingleLineCopy) {
        ctrlWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SINGLE_LINE;
    }

    ctrlWord |= (static_cast<uint64_t>(config.paddingParams.paddingMode) & 0x3) << AIPP_OFFSET_PADDING_MODE;
#if __NPU_ARCH__ == 3002
    ctrlWord |= (static_cast<uint64_t>(config.dtcParams.dtcRoundMode) & 0x1) << AIPP_OFFSET_DTC_ROUND_MODE;
#endif
    ctrlWord |= (static_cast<uint64_t>(config.cPaddingParams.cPaddingMode) & 0x1) << AIPP_OFFSET_CPADDING_MODE;

    set_aipp_spr_9(ctrlWord);
}
// Writes DTC var channels 0 and 1 as 32-bit float bit patterns with set_aipp_spr_18.
// Channel 1 is shifted by AIPP_OFFSET_DTC_CH1. The function returns
// immediately unless the arch is 3002 or 5102.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl18(AippParams<T>& config)
{
#if __NPU_ARCH__ != 3002 && (__NPU_ARCH__ != 5102)
    return;
#endif
    float varCh0 = static_cast<float>(config.dtcParams.dtcVarCh0);
    float varCh1 = static_cast<float>(config.dtcParams.dtcVarCh1);
    const uint32_t varCh0Bits = GetScalarBitcodeValue(varCh0);
    const uint32_t varCh1Bits = GetScalarBitcodeValue(varCh1);
    const uint64_t packed = static_cast<uint64_t>(varCh0Bits) |
                            (static_cast<uint64_t>(varCh1Bits) << AIPP_OFFSET_DTC_CH1);
    set_aipp_spr_18(packed);
}
// Writes DTC var channel 2 as a 32-bit float bit pattern with set_aipp_spr_19.
// The function returns immediately unless the arch is 3002 or 5102.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl19(AippParams<T>& config)
{
#if __NPU_ARCH__ != 3002 && (__NPU_ARCH__ != 5102)
    return;
#endif
    float varCh2 = static_cast<float>(config.dtcParams.dtcVarCh2);
    const uint32_t varCh2Bits = GetScalarBitcodeValue(varCh2);
    set_aipp_spr_19(static_cast<uint64_t>(varCh2Bits));
}
// Writes DTC mean channels 0 and 1 as 32-bit float bit patterns with set_aipp_spr_20.
// Each mean is multiplied by 1.0f before the cast to float; channel 1 is
// shifted by AIPP_OFFSET_DTC_CH1. The function returns immediately unless the
// arch is 3002 or 5102.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl20(AippParams<T>& config)
{
#if __NPU_ARCH__ != 3002 && (__NPU_ARCH__ != 5102)
    return;
#endif
    float meanCh0 = static_cast<float>(config.dtcParams.dtcMeanCh0 * 1.0f);
    float meanCh1 = static_cast<float>(config.dtcParams.dtcMeanCh1 * 1.0f);
    const uint32_t meanCh0Bits = GetScalarBitcodeValue(meanCh0);
    const uint32_t meanCh1Bits = GetScalarBitcodeValue(meanCh1);
    const uint64_t packed = static_cast<uint64_t>(meanCh0Bits) |
                            (static_cast<uint64_t>(meanCh1Bits) << AIPP_OFFSET_DTC_CH1);
    set_aipp_spr_20(packed);
}
// Writes DTC mean channel 2 as a 32-bit float bit pattern with set_aipp_spr_21.
// The mean is multiplied by 1.0f before the cast to float. The function
// returns immediately unless the arch is 3002 or 5102.
template <typename T>
__aicore__ inline void SetAippFunctionsImpl21(AippParams<T>& config)
{
#if __NPU_ARCH__ != 3002 && (__NPU_ARCH__ != 5102)
    return;
#endif
    float meanCh2 = static_cast<float>(config.dtcParams.dtcMeanCh2 * 1.0f);
    const uint32_t meanCh2Bits = GetScalarBitcodeValue(meanCh2);
    set_aipp_spr_21(static_cast<uint64_t>(meanCh2Bits));
}
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
// Builds the AIPP control word for arch 3510 / 5102 and stores it in
// Internal::g_aippArgs.
// The word holds, in this order of assembly: the channel-padding value (8-bit
// pattern for int8_t / uint8_t, 16-bit otherwise), the RB / UV / AX swap flags,
// the input format (low 5 bits), the single-line-copy flag, the padding mode
// (low 2 bits) and the channel-padding mode (low bit).
template <typename T>
__aicore__ inline void SetAippFunctionsImpl22(AippInputFormat format, AippParams<T>& config)
{
    uint64_t argsWord = 0;

    // Channel-padding value occupies the lowest bits.
    if constexpr(IsSameType<T, int8_t>::value || IsSameType<T, uint8_t>::value) {
        const uint8_t cPadBits = GetScalarBitcodeValue(config.cPaddingParams.cPaddingValue);
        argsWord |= static_cast<uint64_t>(cPadBits);
    } else {
        const uint16_t cPadBits = GetScalarBitcodeValue(config.cPaddingParams.cPaddingValue);
        argsWord |= static_cast<uint64_t>(cPadBits);
    }

    // Single-bit swap flags.
    if (config.swapParams.isSwapRB) {
        argsWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_RB;
    }
    if (config.swapParams.isSwapUV) {
        argsWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_UV;
    }
    if (config.swapParams.isSwapAX) {
        argsWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SWAP_AX;
    }

    argsWord |= (static_cast<uint64_t>(format) & 0x1f) << AIPP_OFFSET_FORMAT;

    if (config.singleLineParams.isSingleLineCopy) {
        argsWord |= static_cast<uint64_t>(1) << AIPP_OFFSET_SINGLE_LINE;
    }

    argsWord |= (static_cast<uint64_t>(config.paddingParams.paddingMode) & 0x3) << AIPP_OFFSET_PADDING_MODE;
    argsWord |= (static_cast<uint64_t>(config.cPaddingParams.cPaddingMode) & 0x1) << AIPP_OFFSET_CPADDING_MODE;

    Internal::g_aippArgs = argsWord;
}
#endif
// Programs the complete AIPP configuration for a two-source input.
//
// src0, src1: source addresses forwarded to SetAippFunctionsImpl0 / Impl1.
// format:     input format, encoded into the control word (Impl9 or Impl22).
// config:     AIPP parameters; U is the element type of AippParams.
//
// On arch 2201 nothing is done when ASCEND_IS_AIV holds.
// On arch 3510 / 5102, T must be uint8_t and U must be uint8_t, int8_t or half;
// both are checked at compile time.
// The per-register helpers are called in a fixed order:
//   all archs:       Impl0..Impl4
//   arch 3002:       Impl8, Impl9, Impl18..Impl21
//   other archs:     Impl5..Impl8, then Impl22 (3510 / 5102) or Impl9
template <typename T, typename U>
__aicore__ inline void SetAippFunctionsImpl(__gm__ T* src0, __gm__ T* src1,
    AippInputFormat format, AippParams<U>& config)
{
#if __NPU_ARCH__ == 2201
    if ASCEND_IS_AIV {
        return;
    }
#endif
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    static_assert(SupportType<T, uint8_t>(), "Input type T only supports uint8_t on current device.");
    // This assertion constrains U (the AippParams element type), so the
    // diagnostic names U rather than T.
    static_assert(SupportType<U, uint8_t, int8_t, half>(), "AippParams type U only supports uint8_t, int8_t, half on current device.");
#endif
    // Source addresses and CSC matrix / bias: shared by every architecture.
    SetAippFunctionsImpl0<T>(src0);
    SetAippFunctionsImpl1<T, U>(src1, config);
    SetAippFunctionsImpl2<U>(config);
    SetAippFunctionsImpl3<U>(config);
    SetAippFunctionsImpl4<U>(config);
#if __NPU_ARCH__ == 3002
    SetAippFunctionsImpl8<U>(config);
    SetAippFunctionsImpl9<U>(format, config);
    SetAippFunctionsImpl18<U>(config);
    SetAippFunctionsImpl19<U>(config);
    SetAippFunctionsImpl20<U>(config);
    SetAippFunctionsImpl21<U>(config);
#else
    SetAippFunctionsImpl5<U>(config);
    SetAippFunctionsImpl6<U>(config);
    SetAippFunctionsImpl7<U>(config);
    SetAippFunctionsImpl8<U>(config);
#if defined(__NPU_ARCH__) && ((__NPU_ARCH__ == 3510) || (__NPU_ARCH__ == 5102))
    SetAippFunctionsImpl22<U>(format, config);
#else
    SetAippFunctionsImpl9<U>(format, config);
#endif
#endif
}
// Single-source convenience overload: forwards to the two-source overload
// with a null src1 pointer.
// On arch 2201 nothing is done when ASCEND_IS_AIV holds.
// config is taken by value; the local copy is what gets forwarded.
template <typename T, typename U>
__aicore__ inline void SetAippFunctionsImpl(__gm__ T* src0, AippInputFormat format, AippParams<U> config)
{
#if __NPU_ARCH__ == 2201
    if ASCEND_IS_AIV {
        return;
    }
#endif
    __gm__ T* const nullSrc1 = reinterpret_cast<__gm__ T*>(0);
    SetAippFunctionsImpl(src0, nullSrc1, format, config);
}
#endif
}
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_REG_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_KERNEL_REG_H__
#endif