* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef __ASCENDC_API_DUPLICATE_H__
#define __ASCENDC_API_DUPLICATE_H__
/**
 * Fills dst with copies of a 64-bit scalar by working on the data as 16-bit halfwords.
 *
 * Steps performed here:
 *  1. src is split into four 16-bit pieces (bits 0-15, 16-31, 32-47, 48-63).
 *  2. A 16 x 16 uint16_t tile is created from tmp_buf via KernelUtils::NewTensor (offset argument 0);
 *     row r of the tile is filled with piece (r % 4).
 *  3. The tile is transposed onto itself.
 *  4. dst, reinterpreted as uint16_t, is filled by Copy calls that always use the tile as source.
 *
 * @tparam T       element type of dst; the 32/48-bit shifts and the four pieces assume a 64-bit integer type
 * @param dst      destination tensor
 * @param src      scalar value to replicate
 * @param size     number of T elements to fill
 * @param tmp_buf  scratch tensor that backs the 16 x 16 tile
 *
 * NOTE(review): presumably, after the transpose every 16-halfword row holds the four pieces repeated
 * in order (i.e. four copies of src) - confirm against the Transpose documentation, including whether
 * source and destination may be the same tensor.
 * NOTE(review): the tail copy rounds the remaining halfword count up with SizeAlign, so it can cover
 * more than `size` elements; presumably dst is expected to be padded accordingly - confirm with callers.
 */
template <typename T>
inline __aicore__ void DuplicateInt64(const AscendC::LocalTensor<T> &dst, T src, const uint32_t size,
                                      AscendC::LocalTensor<uint8_t> &tmp_buf) {
  constexpr uint32_t TILE_DIM = 16;
  constexpr uint32_t TILE_BYTE_SIZE = TILE_DIM * TILE_DIM * sizeof(uint16_t);
  constexpr uint32_t PIECES_PER_VALUE = 4;
  constexpr uint32_t ONE_RPT_SIZE = KernelUtils::RptSize<uint16_t>();
  constexpr uint32_t MAX_RPT_SIZE = MAX_REPEAT_TIME * ONE_RPT_SIZE;

  // The four 16-bit pieces of src, lowest bits first.
  const uint16_t piece0 = src & 0xFFFF;
  const uint16_t piece1 = (src >> 16) & 0xFFFF;
  const uint16_t piece2 = (src >> 32) & 0xFFFF;
  const uint16_t piece3 = (src >> 48) & 0xFFFF;

  // Total amount of work expressed in uint16_t elements.
  const uint32_t total_u16 = size * (sizeof(T) / sizeof(uint16_t));

  AscendC::LocalTensor<uint16_t> tile = KernelUtils::NewTensor<uint16_t>(tmp_buf, 0, TILE_BYTE_SIZE);

  // Rows 0..15 receive piece0, piece1, piece2, piece3, piece0, ... in that order.
  for (uint32_t first_row = 0; first_row < TILE_DIM; first_row += PIECES_PER_VALUE) {
    Duplicate(tile[TILE_DIM * (first_row + 0)], piece0, TILE_DIM);
    Duplicate(tile[TILE_DIM * (first_row + 1)], piece1, TILE_DIM);
    Duplicate(tile[TILE_DIM * (first_row + 2)], piece2, TILE_DIM);
    Duplicate(tile[TILE_DIM * (first_row + 3)], piece3, TILE_DIM);
  }
  AscendC::PipeBarrier<PIPE_V>();
  Transpose(tile, tile);
  AscendC::PipeBarrier<PIPE_V>();

  AscendC::LocalTensor<uint16_t> dst_u16 = dst.template ReinterpretCast<uint16_t>();

  // Source strides are zero while destination strides are the defaults.
  // NOTE(review): presumably this makes every destination block re-read the same source block - confirm.
  CopyRepeatParams params;
  params.srcStride = 0;
  params.srcRepeatSize = 0;
  params.dstStride = DEFAULT_BLK_STRIDE;
  params.dstRepeatSize = DEFAULT_REPEAT_STRIDE;

  uint32_t done = 0;

  // Phase 1: as many maximum-repeat Copy calls as fit.
  while (done + MAX_RPT_SIZE <= total_u16) {
    Copy(dst_u16[done], tile, ONE_RPT_SIZE, MAX_REPEAT_TIME, params);
    AscendC::PipeBarrier<PIPE_V>();
    done += MAX_RPT_SIZE;
  }

  // Phase 2: one Copy for the remaining repeats, if at least one full repeat is left.
  if (done + ONE_RPT_SIZE <= total_u16) {
    const uint32_t repeats = KernelUtils::RptNum<uint16_t>(total_u16 - done);
    Copy(dst_u16[done], tile, ONE_RPT_SIZE, repeats, params);
    done += repeats * KernelUtils::RptSize<uint16_t>();
    AscendC::PipeBarrier<PIPE_V>();
  }

  // Phase 3: single-repeat Copy for whatever is still missing, with the count aligned by SizeAlign.
  if (done < total_u16) {
    uint32_t remaining = total_u16 - done;
    const uint32_t tail_mask = KernelUtils::SizeAlign(remaining, KernelUtils::BlkSize<uint16_t>());
    Copy(dst_u16[done], tile, tail_mask, 1, params);
    AscendC::PipeBarrier<PIPE_V>();
  }
}
/**
 * Duplicate overload for int64_t tensors; forwards all arguments to DuplicateInt64.
 *
 * @param dst      destination tensor
 * @param src      value to replicate
 * @param size     number of int64_t elements to fill
 * @param tmp_buf  scratch tensor handed on to DuplicateInt64
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<int64_t> &dst, int64_t src, const uint32_t size,
                                 AscendC::LocalTensor<uint8_t> &tmp_buf) {
  DuplicateInt64<int64_t>(dst, src, size, tmp_buf);
}
/**
 * Duplicate overload for uint64_t tensors; forwards all arguments to DuplicateInt64.
 *
 * @param dst      destination tensor
 * @param src      value to replicate
 * @param size     number of uint64_t elements to fill
 * @param tmp_buf  scratch tensor handed on to DuplicateInt64
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<uint64_t> &dst, uint64_t src, const uint32_t size,
                                 AscendC::LocalTensor<uint8_t> &tmp_buf) {
  DuplicateInt64<uint64_t>(dst, src, size, tmp_buf);
}
/**
 * Duplicate overload for uint8_t tensors.
 *
 * The byte is packed twice into a 16-bit value and dst is filled through a uint16_t view,
 * so the work is done in units of two bytes.
 *
 * @param dst   destination tensor
 * @param src   byte value to replicate
 * @param size  number of uint8_t elements to fill; rounded up to an even count because
 *              (size + 1) / 2 halfwords are written
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<uint8_t> &dst, uint8_t src, const uint32_t size) {
  // Both bytes of the halfword carry src.
  uint16_t tmp_src = src;
  tmp_src <<= 8;
  tmp_src |= src;
  const AscendC::LocalTensor<uint16_t> tmp_dst = dst.ReinterpretCast<uint16_t>();
  // Honour the requested element count (same formula as the int8_t overload) instead of the
  // full tensor size, so elements beyond `size` are not overwritten.
  AscendC::Duplicate(tmp_dst, tmp_src, static_cast<int32_t>(size + 1) / 2);
}
/**
 * Duplicate overload for uint8_t tensors that also accepts a scratch buffer.
 *
 * The scratch buffer is not used; the call is forwarded to the three-argument uint8_t overload.
 *
 * @param dst      destination tensor
 * @param src      byte value to replicate
 * @param size     number of uint8_t elements requested
 * @param tmp_buf  unused
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<uint8_t> &dst, uint8_t src, const uint32_t size,
                                 AscendC::LocalTensor<uint8_t> &tmp_buf) {
  static_cast<void>(tmp_buf);  // accepted only for signature parity with the other overloads
  Duplicate(dst, src, size);
}
/**
 * Duplicate overload for int8_t tensors.
 *
 * The byte pattern of src is packed twice into a 16-bit value and dst is filled through
 * a uint16_t view with AscendC::Duplicate.
 *
 * @param dst   destination tensor
 * @param src   value to replicate
 * @param size  number of int8_t elements to fill; (size + 1) / 2 halfwords are written
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<int8_t> &dst, int8_t src, const uint32_t size) {
  // Work on the unsigned byte so that negative values do not sign-extend into the upper byte.
  const uint8_t raw_byte = static_cast<uint8_t>(src);
  const uint16_t packed = static_cast<uint16_t>((static_cast<uint16_t>(raw_byte) << 8) | raw_byte);
  const int32_t halfword_count = static_cast<int32_t>(size + 1) / 2;
  const AscendC::LocalTensor<uint16_t> dst_as_u16 = dst.ReinterpretCast<uint16_t>();
  AscendC::Duplicate(dst_as_u16, packed, halfword_count);
}
/**
 * Duplicate overload for int8_t tensors that also accepts a scratch buffer.
 *
 * The scratch buffer is not used; the call is forwarded to the three-argument int8_t overload.
 *
 * @param dst      destination tensor
 * @param src      value to replicate
 * @param size     number of int8_t elements requested
 * @param tmp_buf  unused
 */
inline __aicore__ void Duplicate(const AscendC::LocalTensor<int8_t> &dst, int8_t src, const uint32_t size,
                                 AscendC::LocalTensor<uint8_t> &tmp_buf) {
  static_cast<void>(tmp_buf);  // accepted only for signature parity with the other overloads
  Duplicate(dst, src, size);
}
#endif