* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "kernel_tiling/kernel_tiling.h"
#ifdef ASCENDC_CPU_DEBUG
#include "kernel_log.h"
#else
#define __aicore__ [aicore]
#endif
#if defined(ASCENDC_CPU_DEBUG)
template <class T>
inline __aicore__ void InitTilingData(const __gm__ uint8_t *p_tilingdata, T *tilingdata)
#else
template <class T>
__inline__ __attribute__((always_inline)) __aicore__ void InitTilingData(const __gm__ uint8_t *p_tilingdata, T *tilingdata)
#endif
{
constexpr uint64_t all_bytes = sizeof(T);
#if defined(ASCENDC_CPU_DEBUG) || defined(__DAV_C220_CUBE__) || defined(__DAV_C310_CUBE__) || defined(__DAV_310R6_CUBE__) || defined(__GET_CODE_CHANNEL__)
#if defined(__DAV_C100__) || defined(ASCENDC_CPU_DEBUG)
constexpr uint32_t judge_bytes = all_bytes > 15 ? all_bytes - 15 : 0;
uint32_t i = 0;
if (judge_bytes > 0) {
for (; i < judge_bytes; i += 16) {
(*(uint64_t*)((uint8_t*)tilingdata + i)) = (*(const __gm__ uint64_t*)((const __gm__ uint8_t *)p_tilingdata + i));
(*(uint64_t*)((uint8_t*)tilingdata + i + 8)) = (*(const __gm__ uint64_t*)((const __gm__ uint8_t *)p_tilingdata + i + 8));
}
}
if (all_bytes & 0x00000008) {
(*(uint64_t*)((uint8_t*)tilingdata + i)) = (*(const __gm__ uint64_t *)((const __gm__ uint8_t *)p_tilingdata + i));
i += 8;
}
if (all_bytes & 0x00000004) {
(*(uint32_t*)((uint8_t*)tilingdata + i)) = (*(const __gm__ uint32_t *)((const __gm__ uint8_t *)p_tilingdata + i));
i += 4;
}
if (all_bytes & 0x00000002) {
(*(uint16_t*)((uint8_t*)tilingdata + i)) = (*(const __gm__ uint16_t *)((const __gm__ uint8_t *)p_tilingdata + i));
i += 2;
}
if (all_bytes & 0x00000001) {
(*(uint8_t*)((uint8_t*)tilingdata + i)) = (*(const __gm__ uint8_t *)((const __gm__ uint8_t *)p_tilingdata + i));
}
#else
copy_data_align64((uint8_t*)tilingdata, (__gm__ uint8_t *)p_tilingdata, all_bytes);
#endif
#else
__ubuf__ uint8_t *tilingdata_in_ub = (__ubuf__ uint8_t *)get_imm(0);
constexpr uint32_t len_burst = (all_bytes + 31) / 32;
#if defined(__DAV_C310__) || defined(__DAV_310R6__)
copy_gm_to_ubuf_align_v2((__ubuf__ uint8_t *)tilingdata_in_ub, (__gm__ uint8_t *)p_tilingdata, 0, 1, len_burst * 32, 0, 0, false, 0, 0, 0);
constexpr uint32_t DC_PRLOAD_LOOP = (all_bytes)/512;
for (uint64_t loop_dc=0; loop_dc < DC_PRLOAD_LOOP; loop_dc++) {
uint64_t offset = loop_dc*512;
dc_preload((uint64_t *)tilingdata, offset);
}
uint64_t tiling_offset = ((uint64_t)(&tilingdata))%64;
uint32_t DC_PRLOAD_LOOP1 = (all_bytes+63+tiling_offset)/64;
for (uint64_t loop_dc=0; loop_dc < DC_PRLOAD_LOOP1; loop_dc++) {
uint64_t offset = loop_dc*64;
dc_preload((uint64_t *)tilingdata, offset);
}
#elif !defined(__DAV_M310__)
copy_gm_to_ubuf(((__ubuf__ uint8_t *)tilingdata_in_ub), p_tilingdata, 0, 1,len_burst, 0, 0);
#else
copy_gm_to_ubuf_align(((__ubuf__ uint8_t *)tilingdata_in_ub), (__gm__ uint8_t *)p_tilingdata,0, 1, all_bytes, 0, 0, 0, 0);
#endif
set_flag(PIPE_MTE2, PIPE_S, EVENT_ID0);
wait_flag(PIPE_MTE2, PIPE_S, EVENT_ID0);
#if defined __DAV_C100__
constexpr uint32_t judge_bytes = all_bytes > 15 ? all_bytes - 15 : 0;
uint32_t i = 0;
if (judge_bytes > 0) {
for (; i < judge_bytes; i += 16) {
(*(uint64_t*)((uint8_t*)tilingdata + i)) = (*(__ubuf__ uint64_t*)((__ubuf__ uint8_t *)tilingdata_in_ub + i));
(*(uint64_t*)((uint8_t*)tilingdata + i + 8)) = (*(__ubuf__ uint64_t*)((__ubuf__ uint8_t *)tilingdata_in_ub + i + 8));
}
}
if (all_bytes & 0x00000008) {
(*(uint64_t*)((uint8_t*)tilingdata + i)) = (*(__ubuf__ uint64_t *)((__ubuf__ uint8_t *)tilingdata_in_ub + i));
i += 8;
}
if (all_bytes & 0x00000004) {
(*(uint32_t*)((uint8_t*)tilingdata + i)) = (*(__ubuf__ uint32_t *)((__ubuf__ uint8_t *)tilingdata_in_ub + i));
i += 4;
}
if (all_bytes & 0x00000002) {
(*(uint16_t*)((uint8_t*)tilingdata + i)) = (*(__ubuf__ uint16_t *)((__ubuf__ uint8_t *)tilingdata_in_ub + i));
i += 2;
}
if (all_bytes & 0x00000001) {
(*(uint8_t*)((uint8_t*)tilingdata + i)) = (*(__ubuf__ uint8_t *)((__ubuf__ uint8_t *)tilingdata_in_ub + i));
}
#else
copy_data_align64((uint8_t*)tilingdata, (__ubuf__ uint8_t *)tilingdata_in_ub, all_bytes);
#endif
#endif
#ifndef ASCENDC_CPU_DEBUG
pipe_barrier(PIPE_ALL);
#endif
}
#define GET_TILING_DATA(tiling_data, tiling_arg) \
AutofuseTilingData tiling_data; \
InitTilingData<AutofuseTilingData>(tiling_arg, &tiling_data);