/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
#ifndef __LLT_HCCL_STUB_H__
#define __LLT_HCCL_STUB_H__
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <array>
#include <atomic>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <string>
using std::string;
using std::list;
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __cplusplus
}
#endif
#include "hccl/hcom.h"
#include "runtime/base.h"
#include "llt_hccl_stub_sal_pub.h"
/*
 * Byte count for an event unique id: SAL_UNIQUE_ID_BYTES plus 11. The 11 equals the
 * number of characters in EVENT_UNIQUE_ID_PREFIX (terminator excluded), so the two
 * definitions must be changed together.
 */
#define EVENT_UNIQUE_ID_BYTES (SAL_UNIQUE_ID_BYTES + 11)
/* Fixed 11-character prefix associated with event unique ids. */
#define EVENT_UNIQUE_ID_PREFIX "hccl-event-"
/*
 * Symbol-versioning helpers.
 *
 * With HAVE_SYMVER_SUPPORT defined:
 *   symver(name, api, ver)    emits the assembler directive ".symver name,api@ver".
 *   default_symver(name, api) emits ".symver name,api@@" followed by DEFAULT_ABI.
 *                             DEFAULT_ABI must be a string literal supplied elsewhere;
 *                             it is not defined in this header.
 * Without it:
 *   symver(...)               expands to nothing.
 *   default_symver(name, api) declares `api` as an alias of `name` through the
 *                             compiler's alias attribute.
 *
 * None of the expansions carries a trailing semicolon; the call site supplies it.
 */
#ifdef HAVE_SYMVER_SUPPORT
# define symver(name, api, ver) \
asm(".symver " #name "," #api "@" #ver)
# define default_symver(name, api) \
asm(".symver " #name "," #api "@@" DEFAULT_ABI)
#else
# define symver(name, api, ver)
# define default_symver(name, api) \
extern __typeof(name) api __attribute__((alias(#name)))
#endif
/*
 * weak_alias(name, aliasname): declares `aliasname` as a weak alias of `name`.
 * Only defined when no earlier definition of weak_alias exists. The extra
 * _weak_alias level lets macro arguments be expanded before `#name` stringifies
 * them. The expansion already ends with ';'.
 */
#if !defined(weak_alias)
#define weak_alias(name, aliasname) _weak_alias (name, aliasname)
#define _weak_alias(name, aliasname) \
extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
#endif
/*
 * strong_alias(name, aliasname): same as weak_alias but without the `weak`
 * attribute. Only defined when no earlier definition of strong_alias exists.
 * The expansion already ends with ';'.
 */
#if !defined(strong_alias)
#define strong_alias(name, aliasname) _strong_alias (name, aliasname)
#define _strong_alias(name, aliasname) \
extern __typeof (name) aliasname __attribute__ ((alias (#name)));
#endif
/*
 * Device numbering bounds.
 * NOTE(review): this header does not show whether 63 is the highest valid device id
 * or a device count - confirm against the code that uses it.
 */
#define MAX_DEVICE_NUM 63
#define MIN_DEVICE_ID 0
/*
 * Adx namespace: dump-related enum and function declarations.
 * Only declarations appear here; the function definitions are not part of this header.
 */
namespace Adx
{
/* Dump category; the underlying type is int32_t and every value is fixed explicitly. */
enum class DumpType : int32_t {
OPERATOR = 0x01,
EXCEPTION = 0x02,
ARGS_EXCEPTION = 0x03,
OP_OVERFLOW = 0x04
};
/* Takes a DumpType and returns bool. */
bool AdumpIsDumpEnable(DumpType type);
/* Takes a workspace address and size, a stream, an operator type string and a sync flag. */
void AdumpPrintWorkSpace(const void *workSpaceAddr, const size_t dumpWorkSpaceSize,
rtStream_t stream, const char *opType, bool enableSync);
};
/*
 * cce namespace: reduce-operation, status and data-type enumerations.
 * The numeric values are part of the interface and must not be renumbered.
 */
namespace cce {
/**
 * @ingroup cce
 * @brief reduce operation selector
 */
typedef enum tagCceCRedOp
{
    CCE_RED_OP_SUM = 0,
    CCE_RED_OP_PROD = 1,
    CCE_RED_OP_MAX = 2,
    CCE_RED_OP_Min = 3,   /* mixed-case spelling is the existing public name; keep as is */
    CCE_RED_OP_RESERVED   /* == 4 */
} ccReduceOp_t;

/**
 * @ingroup cce
 * @brief status code
 */
typedef enum tagCcStatus {
    CC_STATUS_SUCCESS = 0,
    CC_STATUS_NOT_INITIALIZED = 1,
    CC_STATUS_ALLOC_FAILED = 2,
    CC_STATUS_BAD_PARAM = 3,
    CC_STATUS_INTERNAL_ERROR = 4,
    CC_STATUS_KERNEL_ERROR = 5,
    CC_STATUS_RUNTIME_ERROR = 6,
    CC_STATUS_NOT_SUPPORTED = 7,
    /* Shares the value 7 with CC_STATUS_NOT_SUPPORTED, so the two cannot be told apart
       at run time or used as separate `case` labels. */
    CC_STATUS_INVALID_VALUE = 7,
    CC_STATUS_RESERVED    /* == 8 */
} ccStatus_t;

/**
 * @ingroup cce
 * @brief original data type
 */
typedef enum tagCcDataType {
    CC_DATA_FLOAT = 0,
    CC_DATA_HALF,
    CC_DATA_INT8,
    CC_DATA_INT32,
    CC_DATA_UINT8,
    CC_DATA_HALF_UINT16_PROPOSAL,
    CC_DATA_INT16,
    CC_DATA_UINT16,
    CC_DATA_UINT32,
    CC_DATA_INT64,
    CC_DATA_UINT64,
    CC_DATA_DOUBLE,
    CC_DATA_BOOL,
    CC_DATA_DUAL,
    CC_DATA_DUAL_SUB_INT8,
    CC_DATA_DUAL_SUB_UINT8,
    CC_DATA_COMPLEX64,
    CC_DATA_COMPLEX128,
    CC_DATA_QINT8,
    CC_DATA_QINT16,
    CC_DATA_QINT32,
    CC_DATA_QUINT8,
    CC_DATA_QUINT16,
    CC_DATA_RESERVED      /* == 23 */
} ccDataType_t;
}  // namespace cce
/*
 * Discriminator for stream_task_s::task_type. Values are consecutive from 0;
 * TASK_TYPE_RESERVED is the last enumerator.
 */
enum task_type {
    TASK_TYPE_MEMCPY = 0,
    TASK_TYPE_RECORD = 1,
    TASK_TYPE_MULTIDEV_RECORD = 2,
    TASK_TYPE_WAIT = 3,
    TASK_TYPE_REDUCE = 4,
    TASK_TYPE_USLEEP = 5,
    TASK_TYPE_NOTIFY_RECORD = 6,
    TASK_TYPE_NOTIFY_WAIT = 7,
    TASK_TYPE_RDMA_SEND = 8,
    TASK_TYPE_CALLBACK_FUNC = 9,
    TASK_TYPE_RESERVED = 10
};
using tasktype_e = task_type;
constexpr u64 ENGINE_MAX_TAG_LEN = 31;
* @name ProfReporterData
* @brief struct of data to report
*/
struct ProfReporterData {
char tag[ENGINE_MAX_TAG_LEN + 1];
u32 deviceId;
size_t dataLen;
u8 *data;
};
/* Event handle type: an untyped pointer. */
typedef void *hccl_rt_event_t;
/*
 * Event share-info record: a status word plus two fixed-size id buffers,
 * sized by SAL_SEM_UNIQUE_ID_BYTES and SAL_MUTEX_UNIQUE_ID_BYTES.
 */
struct rt_event_share_info_stub_s {
    s32 record_status;
    char sem_unique_id[SAL_SEM_UNIQUE_ID_BYTES];
    char mutex_unique_id[SAL_MUTEX_UNIQUE_ID_BYTES];
};
using rt_event_share_info_stub_t = rt_event_share_info_stub_s;
/*
 * Stub event object: a semaphore and a mutex handle, an untyped handler pointer,
 * and a pointer to the event's rt_event_share_info_stub_t record.
 * Also used as the `event` member of the stream_task_s payload union.
 */
struct rt_event_stub_s {
    sal_sem_t sem;
    sal_mutex_t mutex;
    void *event_handler;
    rt_event_share_info_stub_t *event_share_info;
};
using rt_event_stub_t = rt_event_stub_s;
/* A (stream id, task id) pair. */
struct task_info_s {
    u32 streamId;
    u32 taskId;
};
using task_info_t = task_info_s;
/*
 * Name-mapping record: two name buffers of SAL_DMEM_UNIQUE_ID_BYTES each
 * (`shm_real_name` and `mapped_name`), followed by size, validity and offset fields.
 */
struct rt_name_map_stub_s {
    char shm_real_name[SAL_DMEM_UNIQUE_ID_BYTES];
    char mapped_name[SAL_DMEM_UNIQUE_ID_BYTES];
    s32 mem_size;
    s32 valid_flag;
    u32 offset;
};
using rt_name_map_stub_t = rt_name_map_stub_s;
/*
 * Thread state constants (unnamed enum, so the enumerators are plain integer
 * constants at file scope). THREAD_STATE_MAX is one past the last state.
 */
enum {
    THREAD_STATE_INITALING = 0,
    THREAD_STATE_STOPED = 1,
    THREAD_STATE_WORKING = 2,
    THREAD_STATE_MAX = 3
};
/* QoS result codes; QOS_SUCCESS is 0 and the error values count up from 1. */
enum class QosErrorCode : int {
    QOS_SUCCESS = 0,
    QOS_UNINIT_ERROR = 1,
    QOS_INIT_ERROR = 2,
    QOS_ILLEGAL_PARA = 3,
    QOS_NOT_FOUND = 4,
    QOS_UNSUPPORTED = 5,
    QOS_DSMI_ERROR = 6,
    QOS_NOMATCH_MPAMID = 7,
};
/*
 * QoS stream categories. Values 0..13 name specific stream kinds; the three
 * trailing entries (OTHERS, INVALID, MAX) follow consecutively.
 */
enum class QosStreamType : int {
    STREAM_FORWARD_COMPUTE = 0,
    STREAM_BACKWARD_COMPUTE = 1,
    STREAM_PARAMETER_UPDATE = 2,
    STREAM_GRADUATION_AGGREGATION = 3,
    STREAM_HCCL_MODEL_LAY_PARALLEL_FEATURE_MAP = 4,
    STREAM_HCCL_MODEL_PIPELINE_PARALLEL_FEATURE_MAP = 5,
    STREAM_HCCL_PARAMETER_PREFETCH = 6,
    STREAM_HCCL_FEATURE_MAP_PREFETCH = 7,
    STREAM_HCCL_FEATURE_MAP_SHARE = 8,
    STREAM_HCCL_EMBEDDING_READ_WRITE = 9,
    STREAM_DVPP_COMPUTE = 10,
    STREAM_L2CACHE_PREFETCH = 11,
    STREAM_L2CACHE_INV_WRB_FLUSH = 12,
    STREAM_AIV_H2D_COPY = 13,
    STREAM_OTHERS = 14,
    STREAM_INVALID = 15,
    STREAM_MAX = 16
};
/* QoS engine selector; values run consecutively from 0. */
enum class QosEngineType : int {
    AI = 0,
    HCCL = 1,
    AICPU = 2,
    MEMCPYS = 3,
    CMO = 4
};
/*
 * QoS configuration record: eight unsigned fields, in this fixed order.
 * Plain aggregate, so positional brace-initialization depends on the order below.
 */
struct QosConfig {
    unsigned mpamId;
    unsigned bwHigh;
    unsigned bwLow;
    unsigned qos;
    unsigned hardlimit;
    unsigned pmg;
    unsigned ns;
    unsigned mode;
};
/*
 * Thread tuning constants.
 * NOTE(review): units and exact use are not visible in this header; the two interval
 * values look like microsecond counts - confirm in the thread_class implementation.
 */
#define THREAD_STOP_COUNTER 20
#define THREAD_DEFAULT_UPDATE_INTERVAL 100000
#define THREAD_UPDATE_MIN 10000
/* Number of slots in rt_notify_shm_s::occupied_flag and ::record_cnt. */
#define NOTIFY_MAX 1024
/* Size in bytes of the ipc_notify_shm_name buffers in the notify structs. */
#define NOTIFY_SHM_NAME_LEN 128
/* NOTE(review): presumably an iteration limit for notify waits - confirm in the implementation. */
#define NOTIFY_TIMEOUT_CNT 20000
/*
 * Notify bookkeeping record. Holds a device id, a name buffer of
 * NOTIFY_SHM_NAME_LEN bytes, two volatile 64-bit fields, and two tables with
 * NOTIFY_MAX entries each (`occupied_flag`, `record_cnt`).
 * The struct and alias names differ on purpose: rt_notify_shm_s vs rt_shm_notify_t.
 */
struct rt_notify_shm_s {
    s32 device_id;
    char ipc_notify_shm_name[NOTIFY_SHM_NAME_LEN];
    volatile u64 name_flag;
    volatile u64 ref_cnt;
    u64 occupied_flag[NOTIFY_MAX];
    u64 record_cnt[NOTIFY_MAX];
};
using rt_shm_notify_t = rt_notify_shm_s;
/* Pairs a name buffer of NOTIFY_SHM_NAME_LEN bytes with a 64-bit notify id. */
struct rt_notify_ipc_name_s {
    char ipc_notify_shm_name[NOTIFY_SHM_NAME_LEN];
    u64 notify_id;
};
using rt_shm_ipc_name_t = rt_notify_ipc_name_s;
/*
 * Notify handle: raw pointers to an rt_shm_ipc_name_t and an rt_shm_notify_t,
 * plus a 64-bit notify id. Ownership of the pointed-to records is not expressed here.
 */
struct rt_notify_s {
    rt_shm_ipc_name_t *ipc_name_shm;
    rt_shm_notify_t *ipc_notify_shm;
    u64 notify_id;
};
using rt_notify_t = rt_notify_s;
/*
 * Declares RAND_bytes taking a char buffer and a length.
 * NOTE(review): OpenSSL's function of the same name takes `unsigned char *`; confirm
 * this stub is never linked together with the real library declaration.
 */
int RAND_bytes(char *buf, int num);
/*
 * Base class for a worker thread. It keeps an update interval, a state word and a
 * thread handle, and exposes three virtual hooks (thread_handler, pre_stop_handler,
 * pre_start_handler) that stream_class redeclares. The destructor is virtual.
 * Member function definitions are not part of this header.
 */
class thread_class
{
private:
/* Current update interval; read and written through the interval accessors below. */
u32 uithread_update_interval;
/* Current state; presumably one of the THREAD_STATE_* constants - confirm in the implementation. */
u32 uithreadstate;
sal_thread_t threadfd;
public:
thread_class();
virtual ~thread_class();
s32 stop_thread();
s32 start_thread();
/* Overload that additionally takes a thread name (passed by value). */
s32 start_thread(string thread_name);
s32 set_new_interval(u32 uinewinterval);
u32 get_current_interval();
s32 update_thread_state(u32 uinewstate);
/* Overridable hooks. */
virtual s32 thread_handler();
virtual s32 pre_stop_handler();
virtual s32 pre_start_handler();
};
/*
 * Free function with a pthread-style start-routine signature.
 * NOTE(review): presumably the entry point that thread_class passes to thread creation - confirm.
 */
void* threadfun(void* p);
/*
 * MSG_LOCK() / MSG_UNLOCK(): take / give the `stream_task_lock` visible at the call
 * site (for example the stream_class member of that name). Both do nothing when the
 * handle is null, and both discard the result of the sal_mutex_* call.
 *
 * Each body is wrapped in do { } while (0) so the macro behaves as exactly one
 * statement. With a bare `if` in the expansion, code such as
 *     if (cond) MSG_LOCK(); else other();
 * would attach the `else` to the macro's internal `if` instead of the caller's.
 * Call sites still terminate the macro with ';' as before.
 */
#define MSG_LOCK() \
    do { \
        if (stream_task_lock) { \
            (void)sal_mutex_take(stream_task_lock, SAL_MUTEX_FOREVER); \
        } \
    } while (0)
#define MSG_UNLOCK() \
    do { \
        if (stream_task_lock) { \
            (void)sal_mutex_give(stream_task_lock); \
        } \
    } while (0)
/* Memcpy task payload: destination pointer, source pointer and a 64-bit count. */
struct rt_memcpy_async_s {
    void *dst;
    void *src;
    uint64_t count;
};
using rt_memcpy_async_t = rt_memcpy_async_s;
/*
 * Reduce task payload: two source pointers, an element count, the cce data type
 * and reduce operation, and the destination pointer. The same set of values
 * appears as the parameters of stream_class::vector_reduce.
 */
struct rt_vector_reduce_s {
    void *src1;
    void *src2;
    uint32_t count_reduce;
    cce::ccDataType_t datatype;
    cce::ccReduceOp_t op;
    void *dst_reduce;
};
using rt_vector_reduce_t = rt_vector_reduce_s;
/*
 * RDMA-send task payload: a WQE index and an untyped pointer, matching the
 * parameter types of stream_class::rdma_send(u32, void*).
 */
struct rt_rdma_send_s {
    u32 wqe_index;
    void *cn;
};
using rt_rdma_send_t = rt_rdma_send_s;
/*
 * Callback task payload: an untyped parameter pointer and two flags.
 * The stream_task_s constructor initialises `isExecuted` to 0 and `isBlock` to false.
 */
struct rt_callback_func_stub_s {
    void *para;
    u8 isExecuted;
    bool isBlock;
};
using rt_callback_func_stub_t = rt_callback_func_stub_s;
/*
 * One queued stream task: a type tag, a task id, and a union holding the payload
 * for that type. A default-constructed task has type TASK_TYPE_RESERVED, the
 * all-ones task id, and a fully zeroed payload.
 */
struct stream_task_s {
    tasktype_e task_type;
    u32 task_id;

    /* Payload storage; all alternatives share the same bytes. */
    union {
        rt_event_stub_t event;
        rt_memcpy_async_t memcpystruct;
        rt_vector_reduce_t reducestruct;
        rt_notify_t *notify;
        rt_rdma_send_t rdmasend;
        rt_callback_func_stub_t callbackTask;
        u32 usec;
    } stream_para;

    stream_task_s()
        : task_type(TASK_TYPE_RESERVED),
          task_id(static_cast<u32>(-1))
    {
        /* Clear the whole union first, then set the callback flags explicitly. */
        memset(&stream_para, 0, sizeof(stream_para));
        stream_para.callbackTask.isExecuted = 0;
        stream_para.callbackTask.isBlock = false;
    }
};
using stream_task_t = stream_task_s;
/* Size in bytes of ProfileTaskTrack::kernelName. */
#define M_PROF_KERNEL_TASK_NAME_LEN (63)
/*
 * Profile data header: a single 64-bit field. The field name `rserved` is the
 * existing spelling and is kept so code that accesses it still compiles.
 */
struct tag_rt_profile_data_head_s {
    u64 rserved;
};
using rtProfileDataHead_t = tag_rt_profile_data_head_s;
/*
 * Task-track profiling record. Starts with an rtProfileDataHead_t, followed by a
 * timestamp, four 16-bit fields, two 32-bit fields, a kernel-name buffer of
 * M_PROF_KERNEL_TASK_NAME_LEN bytes and two bit-fields (1 bit + 7 bits).
 * Field order is kept exactly, since it determines the record layout.
 */
struct ProfileTaskTrack {
    rtProfileDataHead_t head;
    u64 timeStamp;
    u16 eventName;
    u16 taskType;
    u16 streamId;
    u16 taskId;
    u32 thread;
    u32 deviceId;
    char kernelName[M_PROF_KERNEL_TASK_NAME_LEN];
    u8 persistant : 1;
    u8 reserved : 7;
};
using rtProfTaskTrack_t = ProfileTaskTrack;
using atomic_ptr_t = std::shared_ptr<std::atomic<u32>>;
class stream_class : private thread_class
{
private:
sal_mutex_t stream_task_lock;
s32 thread_handler();
s32 pre_stop_handler();
s32 pre_start_handler();
sal_sem_t thread_trigger;
void trigger_thread();
sal_sem_t stream_task_done;
rtError_t stream_usleep(u32 usec);
rtError_t event_record(rtEvent_t event);
rtError_t event_multidev_record(rtEvent_t event);
rtError_t event_wait(rtEvent_t event);
rtError_t memcpy_async(void* dst, void* src, uint64_t count);
rtError_t notify_record(rt_notify_t* notify);
rtError_t notify_wait(rt_notify_t* notify);
template <typename T>
cce::ccStatus_t reduce_op(T src1, T src2, T* dst, const cce::ccReduceOp_t op);
cce::ccStatus_t vector_reduce( const void* src1, const void* src2,
uint32_t count, const cce::ccDataType_t datatype,
const cce::ccReduceOp_t op, void* dst );
public:
explicit stream_class(s32 device_id);
virtual ~stream_class();
void HWTSLog(const stream_task_t& task, u64 ts_start, u64 duration);
u64 TimestampNanosecond();
s32 push_task(stream_task_t* stream_task);
s32 get_stream_id() const;
s32 get_device_id() const;
rtError_t stream_synchronize();
void set_stream_enabled(bool enabled);
s32 current_dev;
static rtError_t rdma_send(u32 wqe_index, void* cnn);
void ExecuteCallbackFunc();
list<stream_task_t> stream_task_list;
private:
static std::atomic<s32> streamIdCounter_;
static std::atomic<u32> taskIdCounter_;
static std::map<rtStream_t, int32_t> streamMap_;
static std::mutex mapMutex_;
static std::map<s32, atomic_ptr_t> refCountMap_;
static std::array<std::string, 8> lineFeed_;
s32 deviceId_;
s32 streamId_;
ProfReporterData dataRuntime_;
ProfReporterData dataHWTS_;
bool stream_enabled_;
};
#ifdef __cplusplus
extern "C" {
#endif
/* C-linkage declaration taking a bool; the definition is not part of this header. */
void rtSetCommonPidMode(bool state);
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Engine selector stored in rtLaunchKernelAttrVal_t::engineType. */
typedef enum {
    RT_ENGINE_TYPE_AIC = 0,
    RT_ENGINE_TYPE_AIV = 1
} rtEngineType;
/*
 * Value part of a launch-kernel attribute. All members overlay the same storage;
 * which one is meaningful is presumably selected by the rtLaunchKernelAttrId stored
 * next to it in rtLaunchKernelAttr_t - confirm against the runtime API.
 * `rsv` spans four 32-bit words.
 */
typedef union {
uint8_t schemMode;
uint32_t localMemorySize;
rtEngineType engineType;
uint32_t blockDimOffset;
uint8_t isBlockTaskPrefetch;
uint8_t isDataDump;
uint16_t timeout;
uint32_t rsv[4];
} rtLaunchKernelAttrVal_t;
/*
 * Identifier of a launch-kernel attribute. Numbering starts at 1;
 * RT_LAUNCH_KERNEL_ATTR_MAX is one past the last real attribute.
 */
typedef enum {
    RT_LAUNCH_KERNEL_ATTR_SCHEM_MODE = 1,
    RT_LAUNCH_KERNEL_ATTR_DYN_UBUF_SIZE = 2,
    RT_LAUNCH_KERNEL_ATTR_ENGINE_TYPE = 3,
    RT_LAUNCH_KERNEL_ATTR_BLOCKDIM_OFFSET = 4,
    RT_LAUNCH_KERNEL_ATTR_BLOCK_TASK_PREFETCH = 5,
    RT_LAUNCH_KERNEL_ATTR_DATA_DUMP = 6,
    RT_LAUNCH_KERNEL_ATTR_TIMEOUT = 7,
    RT_LAUNCH_KERNEL_ATTR_MAX = 8
} rtLaunchKernelAttrId;
/* One launch-kernel attribute: an id together with its value union. */
typedef struct {
rtLaunchKernelAttrId id;
rtLaunchKernelAttrVal_t value;
} rtLaunchKernelAttr_t;
/*
 * Launch configuration: pointer to an attribute array plus a count.
 * NOTE(review): presumably `numAttrs` is the element count of `attrs` - confirm with callers.
 */
typedef struct {
rtLaunchKernelAttr_t *attrs;
size_t numAttrs;
} rtKernelLaunchCfg_t;
/* Pair of 32-bit offsets (address offset, data offset). */
typedef struct {
uint32_t addrOffset;
uint32_t dataOffset;
} rtPlaceHolderInfo_t;
/* C-linkage declaration: takes a stream handle and returns rtError_t; defined elsewhere. */
rtError_t rtStreamSynchronize(rtStream_t stream);
#ifdef __cplusplus
}
#endif
#endif