/**
* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file matmul_kernel.h
* \brief Host-side launch declarations for blasLt matmul device kernels.
*
* Implementations are provided under type-specific directories (for example `blasLt/matmul_fp32/`); this header
* collects their entry points as additional precisions (FP16, MXFP8, and others) are integrated.
*/
#pragma once
#include <cstdint>
#include <acl/acl.h>
#include "epilogue_alpha_beta_tiling_data.h"
#include "matmul_tiling_data.h"
/*!
 * \brief Tiling query for the alpha/beta epilogue.
 *
 * NOTE(review): parameter roles below are inferred from their names; confirm against the implementation.
 *
 * \param m          Presumably the row count of the matrix processed by the epilogue.
 * \param n          Presumably the column count of the matrix processed by the epilogue.
 * \param numBlocks  Presumably the number of blocks the work is split across.
 * \param tilingData Non-const reference; presumably receives the computed tiling.
 */
void epilogue_alpha_beta_get_tiling(
    uint32_t m,
    uint32_t n,
    uint32_t numBlocks,
    EpilogueAlphaBetaTilingData& tilingData);
/*!
 * \brief Host-side launch declaration for the alpha/beta epilogue kernel, driven by a caller-supplied tiling.
 *
 * NOTE(review): buffer roles below are inferred from parameter names only; confirm against the implementation.
 *
 * \param dRaw   Byte pointer; presumably the raw matmul result consumed by the epilogue.
 * \param c      Byte pointer; presumably the C operand.
 * \param d      Byte pointer; presumably the final output D.
 * \param tiling Tiling description, taken by const reference.
 * \param stream Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void epilogue_alpha_beta_kernel_do(
    uint8_t* dRaw,
    uint8_t* c,
    uint8_t* d,
    const EpilogueAlphaBetaTilingData& tiling,
    void* stream);
/*!
 * \brief Host-side entry point for the alpha/beta epilogue, taking problem sizes, strides, scalars and data types.
 *
 * NOTE(review): judging by the names, this presumably evaluates D = alpha * dRaw + beta * C; the operation and
 * every parameter role below are inferred from names only and must be confirmed against the implementation.
 *
 * \param dRaw      Byte pointer; presumably the raw matmul result.
 * \param c         Byte pointer; presumably the C operand.
 * \param d         Byte pointer; presumably the final output D.
 * \param m         Presumably the row count.
 * \param n         Presumably the column count.
 * \param ldc       Presumably the leading dimension of \p c.
 * \param ldd       Presumably the leading dimension of \p d.
 * \param lddRaw    Presumably the leading dimension of \p dRaw.
 * \param alpha     Scalar; presumably the multiplier applied to \p dRaw.
 * \param beta      Scalar; presumably the multiplier applied to \p c.
 * \param dtypeC    Data type tag; presumably describes the elements of \p c.
 * \param dtypeDRaw Data type tag; presumably describes the elements of \p dRaw.
 * \param dtypeD    Data type tag; presumably describes the elements of \p d.
 * \param stream    Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void epilogue_alpha_beta_do(
    uint8_t* dRaw,
    uint8_t* c,
    uint8_t* d,
    uint32_t m,
    uint32_t n,
    uint32_t ldc,
    uint32_t ldd,
    uint32_t lddRaw,
    float alpha,
    float beta,
    aclDataType dtypeC,
    aclDataType dtypeDRaw,
    aclDataType dtypeD,
    void* stream);
/*!
 * \brief Host-side launch declaration for the FP32 matmul kernel.
 *
 * NOTE(review): buffer roles below are inferred from parameter names only; confirm against the implementation.
 *
 * \param a         Byte pointer; presumably the A operand.
 * \param b         Byte pointer; presumably the B operand.
 * \param dRaw      Byte pointer; presumably receives the raw (pre-epilogue) product.
 * \param tiling    FP32 matmul tiling description, taken by const reference.
 * \param numBlocks Presumably the number of blocks used for the launch.
 * \param stream    Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void matmul_fp32_kernel_do(
    uint8_t* a,
    uint8_t* b,
    uint8_t* dRaw,
    const MatmulFp32TilingData& tiling,
    uint32_t numBlocks,
    void* stream);
/*!
 * \brief Host-side launch declaration for the MXFP8 matmul kernel.
 *
 * NOTE(review): buffer roles and flag meanings below are inferred from parameter names only; confirm against
 * the implementation.
 *
 * \param dA      Byte pointer; presumably the A operand.
 * \param dB      Byte pointer; presumably the B operand.
 * \param dScaleA Byte pointer; presumably the scale factors associated with A.
 * \param dScaleB Byte pointer; presumably the scale factors associated with B.
 * \param dC      Byte pointer; presumably the output.
 * \param tiling  Quantized matmul tiling description, taken by const reference.
 * \param transA  Presumably selects a transposed A operand.
 * \param transB  Presumably selects a transposed B operand.
 * \param stream  Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void matmul_mxfp8_kernel_do(
    uint8_t* dA,
    uint8_t* dB,
    uint8_t* dScaleA,
    uint8_t* dScaleB,
    uint8_t* dC,
    const QuantMatmulTilingData& tiling,
    bool transA,
    bool transB,
    void* stream);
/*!
 * \brief Host-side launch declaration for the MXFP4 matmul kernel, `e2m1_e2m1_fp32` variant.
 *
 * NOTE(review): the name suggests E2M1 inputs for both A and B with an FP32 result, but that and every
 * parameter role below are inferred from names only; confirm against the implementation.
 *
 * \param dA      Byte pointer; presumably the A operand.
 * \param dB      Byte pointer; presumably the B operand.
 * \param dScaleA Byte pointer; presumably the scale factors associated with A.
 * \param dScaleB Byte pointer; presumably the scale factors associated with B.
 * \param dC      Byte pointer; presumably the output.
 * \param tiling  Quantized matmul tiling description, taken by const reference.
 * \param transA  Presumably selects a transposed A operand.
 * \param transB  Presumably selects a transposed B operand.
 * \param stream  Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void matmul_mxfp4_kernel_do_e2m1_e2m1_fp32(
    uint8_t* dA,
    uint8_t* dB,
    uint8_t* dScaleA,
    uint8_t* dScaleB,
    uint8_t* dC,
    const QuantMatmulTilingData& tiling,
    bool transA,
    bool transB,
    void* stream);
/*!
 * \brief Host-side launch declaration for the MXFP4 matmul kernel, `e2m1_e2m1_bf16` variant.
 *
 * NOTE(review): the name suggests E2M1 inputs for both A and B with a BF16 result, but that and every
 * parameter role below are inferred from names only; confirm against the implementation.
 *
 * \param dA      Byte pointer; presumably the A operand.
 * \param dB      Byte pointer; presumably the B operand.
 * \param dScaleA Byte pointer; presumably the scale factors associated with A.
 * \param dScaleB Byte pointer; presumably the scale factors associated with B.
 * \param dC      Byte pointer; presumably the output.
 * \param tiling  Quantized matmul tiling description, taken by const reference.
 * \param transA  Presumably selects a transposed A operand.
 * \param transB  Presumably selects a transposed B operand.
 * \param stream  Opaque stream handle; presumably the stream the launch is enqueued on.
 */
void matmul_mxfp4_kernel_do_e2m1_e2m1_bf16(
    uint8_t* dA,
    uint8_t* dB,
    uint8_t* dScaleA,
    uint8_t* dScaleB,
    uint8_t* dC,
    const QuantMatmulTilingData& tiling,
    bool transA,
    bool transB,
    void* stream);