* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file tool.h
* \brief
*/
#ifndef GROUPED_MATMUL_WEIGHT_QUANT_TOOL_H
#define GROUPED_MATMUL_WEIGHT_QUANT_TOOL_H
#include "kernel_operator.h"
using AscendC::CrossCoreSetFlag;
using AscendC::CrossCoreWaitFlag;
using AscendC::DataCopyExtParams;
using AscendC::DataCopyPadExtParams;
using AscendC::GetUserWorkspace;
using AscendC::GlobalTensor;
using AscendC::int4b_t;
using AscendC::IsSameType;
using AscendC::LocalTensor;
using AscendC::ONE_BLK_SIZE;
using AscendC::TPosition;
namespace GROUPED_MATMUL::A16W4Msd {
template <typename T>
__aicore__ inline T CeilAlign(T a, T b)
{
ASCENDC_ASSERT(b != 0, { KERNEL_LOG(KERNEL_ERROR, "Division by zero error!"); });
return (a + b - 1) / b * b;
}
template <typename T>
__aicore__ inline T CeilDiv(T a, T b)
{
ASCENDC_ASSERT(b != 0, { KERNEL_LOG(KERNEL_ERROR, "Division by zero error!"); });
return (a + b - 1) / b;
}
template <typename T>
__aicore__ inline void DataCopyPad2D(const LocalTensor<T> &dst, const GlobalTensor<T> &src, uint32_t blockCount,
uint32_t blockLen, uint32_t dstInnerLength, uint32_t srcInnerLength)
{
DataCopyExtParams params;
params.blockCount = blockCount;
params.blockLen = blockLen * sizeof(T);
params.srcStride = (srcInnerLength - blockLen) * sizeof(T);
params.dstStride = (dstInnerLength - blockLen) * sizeof(T) / ONE_BLK_SIZE;
DataCopyPadExtParams<T> padParams;
if (blockLen % (32 / sizeof(T)) != 0) {
padParams.isPad = true;
padParams.rightPadding = CeilAlign(blockLen, static_cast<uint32_t>(32 / sizeof(T))) - blockLen;
padParams.paddingValue = 0;
}
if constexpr (IsSameType<T, int4b_t>::value) {
params.blockLen = params.blockLen >> 1;
params.srcStride = params.srcStride >> 1;
params.dstStride = params.dstStride >> 1;
padParams.rightPadding = padParams.rightPadding >> 1;
}
DataCopyPad(dst, src, params, padParams);
}
template <typename T>
__aicore__ inline void DataCopyPad2D(const GlobalTensor<T> &dst, const LocalTensor<T> &src, uint32_t dim1,
uint32_t dim0, uint32_t srcFullDim0, uint32_t dstFullDim0)
{
DataCopyExtParams params;
params.blockCount = dim1;
params.blockLen = dim0 * sizeof(T);
params.srcStride = CeilDiv((srcFullDim0 - dim0) * sizeof(T), static_cast<uint64_t>(ONE_BLK_SIZE));
params.dstStride = (dstFullDim0 - dim0) * sizeof(T);
DataCopyPad(dst, src, params);
}
template <typename T>
__aicore__ constexpr uint32_t GetKBUnit()
{
if constexpr (IsSameType<T, int4b_t>::value) {
return 2048;
}
if constexpr (IsSameType<T, int8_t>::value) {
return 1024;
}
if constexpr (IsSameType<T, float>::value) {
return 256;
}
return 512;
}
}
#endif