// SPDX-License-Identifier: Mulan PSL v2
/*
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This software is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *         http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

use std::{
    ffi::CStr,
    os::raw::{c_int, c_longlong, c_uint, c_void},
    path::Path,
    sync::OnceLock,
};

use procfs::process::{MMapPath, Process};
use tracing::{debug, error, info, warn};

use cudax::{cublas, cublaslt, driver, nccl, nvml, runtime};
use xgpu_common::{
    ipc::message::{Argument, ArgumentFlag},
    sys::dynlib,
};

use super::api::{ApiHandler, ServerErr};

/// Process-wide `u64` cell that can be initialised at most once (`OnceLock`).
///
/// NOTE(review): presumably a base address that is set during start-up and
/// read by other handlers — confirm against the code that calls
/// `BASE_ADDR.set` / `BASE_ADDR.get`.
pub static BASE_ADDR: OnceLock<u64> = OnceLock::new();

/// Serves `cudaDeviceReset` requests; the request carries no arguments.
pub struct CudaDeviceResetHandler;
impl ApiHandler for CudaDeviceResetHandler {
    /// Calls `runtime::cudaDeviceReset` and returns its status as an `ARG_OUT`
    /// argument. A status greater than zero (after the `i32` cast) is reported
    /// as `ServerErr::ApiRunError` with the API name and that code.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: FFI call that takes no arguments.
        let status = unsafe { runtime::cudaDeviceReset() };
        match status as i32 {
            code if code > 0 => Err(ServerErr::ApiRunError(
                String::from("cudaDeviceReset"),
                code,
            )),
            _ => Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT)),
        }
    }
}

/// Serves `cudaDeviceSynchronize` requests; the request carries no arguments.
pub struct CudaDeviceSynchronizeHandler;
impl ApiHandler for CudaDeviceSynchronizeHandler {
    /// Calls `runtime::cudaDeviceSynchronize` and returns its status as an
    /// `ARG_OUT` argument, or `ServerErr::ApiRunError` when the status (as
    /// `i32`) is greater than zero.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: FFI call that takes no arguments.
        let status = unsafe { runtime::cudaDeviceSynchronize() };
        let code = status as i32;
        if code > 0 {
            Err(ServerErr::ApiRunError(
                "cudaDeviceSynchronize".to_owned(),
                code,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Serves `cudaDeviceGetStreamPriorityRange` requests.
pub struct CudaDeviceGetStreamPriorityRangeHandler;
impl ApiHandler for CudaDeviceGetStreamPriorityRangeHandler {
    /// Reads `args[0]` (`least`) and `args[1]` (`greatest`) as mutable `c_int`
    /// slots, passes their addresses to the CUDA runtime and returns the status
    /// as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument is not a `c_int`;
    /// `ServerErr::ApiRunError` when the status (as `i32`) is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let least_out = match unsafe { args[0].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <least> expected: c_int".into(),
                ))
            }
        };
        // SAFETY: same assumption as for `least_out`.
        let greatest_out = match unsafe { args[1].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <greatest> expected: c_int".into(),
                ))
            }
        };
        debug!("least:{}, greatest:{}", *least_out, *greatest_out);
        // SAFETY: both pointers are derived from the references obtained above.
        let status = unsafe {
            runtime::cudaDeviceGetStreamPriorityRange(
                least_out as *mut c_int,
                greatest_out as *mut c_int,
            )
        };
        debug!("after least:{}, greatest:{}", *least_out, *greatest_out);
        match status as i32 {
            code if code > 0 => Err(ServerErr::ApiRunError(
                String::from("cudaDeviceGetStreamPriorityRange"),
                code,
            )),
            _ => Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT)),
        }
    }
}

/// Serves `cudaGetLastError` requests; the request carries no arguments.
pub struct CudaGetLastErrorHandler;
impl ApiHandler for CudaGetLastErrorHandler {
    /// Calls `runtime::cudaGetLastError` and returns the value as an `ARG_OUT`
    /// argument; a value greater than zero (as `i32`) is returned as
    /// `ServerErr::ApiRunError` instead.
    ///
    /// NOTE(review): a non-zero value is the normal payload of this API, yet it
    /// is surfaced through the error path here — confirm the caller forwards
    /// the code to the client.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: FFI call that takes no arguments.
        let last_error = unsafe { runtime::cudaGetLastError() };
        let code = last_error as i32;
        if code > 0 {
            Err(ServerErr::ApiRunError(
                "cudaGetLastError".to_owned(),
                code,
            ))
        } else {
            Ok(Argument::from_value(last_error, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Serves `cudaPeekAtLastError` requests; the request carries no arguments.
pub struct CudaPeekAtLastErrorHandler;
impl ApiHandler for CudaPeekAtLastErrorHandler {
    /// Logs the call, invokes `runtime::cudaPeekAtLastError` and returns the
    /// value as an `ARG_OUT` argument; a value greater than zero (as `i32`) is
    /// returned as `ServerErr::ApiRunError` instead.
    ///
    /// NOTE(review): a non-zero value is the normal payload of this API, yet it
    /// is surfaced through the error path here — confirm the caller forwards
    /// the code to the client.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("-------cudaPeekAtLastError");
        // SAFETY: FFI call that takes no arguments.
        let pending = unsafe { runtime::cudaPeekAtLastError() };
        match pending as i32 {
            code if code > 0 => Err(ServerErr::ApiRunError(
                String::from("cudaPeekAtLastError"),
                code,
            )),
            _ => Ok(Argument::from_value(pending, ArgumentFlag::ARG_OUT)),
        }
    }
}

/// Serves `cudaGetDeviceCount` requests.
pub struct CudaGetDeviceCountHandler;
impl ApiHandler for CudaGetDeviceCountHandler {
    /// Reads `args[0]` (`count`) as a mutable `c_int` slot, passes its address
    /// to the CUDA runtime and returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `c_int`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let count_out = match unsafe { args[0].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <count> expected: c_int".into(),
                ))
            }
        };
        // SAFETY: the pointer is derived from the reference obtained above.
        let status = unsafe { runtime::cudaGetDeviceCount(count_out as *mut i32) };
        debug!("cudaGetDeviceCount, res ={}", status);
        if status > 0 {
            Err(ServerErr::ApiRunError(
                String::from("cudaGetDeviceCount"),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Serves `cudaGetDeviceProperties_v2` requests.
pub struct CudaGetDevicePropertiesV2Handler;

impl ApiHandler for CudaGetDevicePropertiesV2Handler {
    /// Reads `args[0]` (`prop`) as a mutable `runtime::cudaDeviceProp` that the
    /// runtime fills in, and `args[1]` (`device`) as a read-only `c_int`.
    /// Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `cudaDeviceProp` slot inside the argument buffer — confirm against
        // `Argument`.
        let prop = unsafe {
            args[0]
                .downcast_mut::<runtime::cudaDeviceProp>()
                .map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <prop> expected: cudaDeviceProp".into())
                })?
        };
        // `device` is only read, so the safe shared downcast used by the other
        // handlers for by-value inputs is sufficient.
        let device = args[1]
            .downcast_ref::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <device> expected: c_int".into()))?;
        // SAFETY: the pointer is derived from the reference obtained above.
        let res = unsafe {
            runtime::cudaGetDeviceProperties_v2(prop as *mut runtime::cudaDeviceProp, *device)
        };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cudaGetDeviceProperties_v2".to_string(),
                res as i32,
            ));
        }
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaDeviceGetAttribute` requests.
pub struct CudaDeviceGetAttributeHandler;
impl ApiHandler for CudaDeviceGetAttributeHandler {
    /// Reads `args[0]` (`value`) as a mutable `c_int` slot, `args[1]` (`attr`)
    /// as `runtime::cudaDeviceAttr` and `args[2]` (`device`) as `c_int`, then
    /// forwards them to the CUDA runtime. Returns the status as an `ARG_OUT`
    /// argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let value_out = match unsafe { args[0].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <value> expected: c_int".into(),
                ))
            }
        };
        let Ok(attribute) = args[1].downcast_ref::<runtime::cudaDeviceAttr>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <attr> expected: runtime::cudaDeviceAttr".into(),
            ));
        };
        let Ok(device_id) = args[2].downcast_ref::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <device> expected: c_int".into(),
            ));
        };
        // SAFETY: the pointer is derived from the reference obtained above.
        let status = unsafe {
            runtime::cudaDeviceGetAttribute(value_out as *mut c_int, *attribute, *device_id)
        };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaDeviceGetAttribute"),
                status as i32,
            ));
        }
        debug!("----------CudaDeviceGetAttribute");
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaSetDevice` requests.
pub struct CudaSetDeviceHandler;
impl ApiHandler for CudaSetDeviceHandler {
    /// Reads `args[0]` (`device`) as `c_int`, calls `runtime::cudaSetDevice`
    /// and returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `c_int`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let Ok(device_id) = args[0].downcast_ref::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <device> expected: c_int".into(),
            ));
        };
        // SAFETY: FFI call that receives the device ordinal by value.
        let status = unsafe { runtime::cudaSetDevice(*device_id) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                String::from("cudaSetDevice"),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Serves `cudaGetDevice` requests.
pub struct CudaGetDeviceHandler;
impl ApiHandler for CudaGetDeviceHandler {
    /// Reads `args[0]` (`device`) as a mutable `c_int` slot, passes its address
    /// to `runtime::cudaGetDevice` and returns the status as an `ARG_OUT`
    /// argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `c_int`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let device = unsafe {
            args[0].downcast_mut::<c_int>().map_err(|_| {
                // The message names the actual parameter (`device`); it used to
                // say `<value>`, copied from another handler.
                ServerErr::InvalidType("InvalidType, <device> expected: c_int".into())
            })?
        };
        // SAFETY: the pointer is derived from the reference obtained above.
        let res = unsafe { runtime::cudaGetDevice(device as *mut c_int) };
        debug!("cudaGetDevice, res ={}", res);
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cudaGetDevice".to_string(),
                res as i32,
            ));
        }
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaStreamCreateWithPriority` requests.
pub struct CudaStreamCreateWithPriorityHandler;
impl ApiHandler for CudaStreamCreateWithPriorityHandler {
    /// Reads `args[0]` as a mutable `usize` slot whose address is handed to the
    /// runtime as the `cudaStream_t` out-pointer, `args[1]` (`flags`) as
    /// `c_uint` and `args[2]` (`priority`) as `c_int`. Returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `usize` slot inside the argument buffer — confirm against `Argument`.
        let stream_out = match unsafe { args[0].downcast_mut::<usize>() } {
            Ok(slot) => slot as *mut usize as *mut runtime::cudaStream_t,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <p_stream> expected: usize".into(),
                ))
            }
        };
        let Ok(create_flags) = args[1].downcast_ref::<c_uint>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <flags> expected: c_uint".into(),
            ));
        };
        let Ok(stream_priority) = args[2].downcast_ref::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <priority> expected: c_int".into(),
            ));
        };

        // SAFETY: `stream_out` points at the slot borrowed from `args[0]` above.
        let status = unsafe {
            runtime::cudaStreamCreateWithPriority(stream_out, *create_flags, *stream_priority)
        };
        debug!("----------cudaStreamCreateWithPriority, res: {}", status);
        if status > 0 {
            Err(ServerErr::ApiRunError(
                String::from("cudaStreamCreateWithPriority"),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Serves `cudaStreamWaitEvent` requests.
pub struct CudaStreamWaitEventHandler;
impl ApiHandler for CudaStreamWaitEventHandler {
    /// Reads `args[0]` (`stream`) and `args[1]` (`event`) as `usize` values
    /// that are cast to the CUDA handle types, and `args[2]` (`flags`) as
    /// `c_uint`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaStreamWaitEvent");
        let Ok(stream_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        let Ok(event_bits) = args[1].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        let Ok(wait_flags) = args[2].downcast::<c_uint>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <flags> expected: c_uint".into(),
            ));
        };
        // SAFETY: FFI call; the handle values come straight from the request
        // and are not validated here.
        let status = unsafe { runtime::cudaStreamWaitEvent(stream, event, wait_flags) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaStreamWaitEvent"),
                status as i32,
            ));
        }
        debug!("--------------cudaStreamWaitEvent, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaStreamSynchronize` requests.
pub struct CudaStreamSynchronizeHandler;
impl ApiHandler for CudaStreamSynchronizeHandler {
    /// Reads `args[0]` (`stream`) as a `usize` that is cast to
    /// `runtime::cudaStream_t`, calls the runtime and returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaStreamSynchronize");
        let Ok(stream_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        // SAFETY: FFI call; the handle value comes straight from the request
        // and is not validated here.
        let status = unsafe { runtime::cudaStreamSynchronize(stream) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaStreamSynchronize"),
                status as i32,
            ));
        }
        debug!("--------------cudaStreamSynchronize, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaThreadExchangeStreamCaptureMode` requests.
pub struct CudaThreadExchangeStreamCaptureModeHandler;
impl ApiHandler for CudaThreadExchangeStreamCaptureModeHandler {
    /// Reads `args[0]` (`mode`) as a mutable `runtime::cudaStreamCaptureMode`
    /// that is passed by reference to the runtime, and returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a
    /// `runtime::cudaStreamCaptureMode`; `ServerErr::ApiRunError` when the
    /// status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `cudaStreamCaptureMode` slot inside the argument buffer — confirm
        // against `Argument`.
        let mode = unsafe {
            args[0]
                .downcast_mut::<runtime::cudaStreamCaptureMode>()
                .map_err(|_| {
                    // The message names the real parameter and type; it used to
                    // say `<p_stream>` / `runtime::cudaStream_t`.
                    ServerErr::InvalidType(
                        "InvalidType, <mode> expected: runtime::cudaStreamCaptureMode".into(),
                    )
                })?
        };
        // SAFETY: `mode` is the reference obtained above.
        let res = unsafe { runtime::cudaThreadExchangeStreamCaptureMode(mode) };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cudaThreadExchangeStreamCaptureMode".to_string(),
                res as i32,
            ));
        }
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaStreamIsCapturing` requests.
pub struct CudaStreamIsCapturingHandler;
impl ApiHandler for CudaStreamIsCapturingHandler {
    /// Reads `args[0]` (`stream`) as a `usize` that is cast to
    /// `runtime::cudaStream_t` and `args[1]` (`pCaptureStatus`) as a mutable
    /// `runtime::cudaStreamCaptureStatus`. Returns the status as an `ARG_OUT`
    /// argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cudaStreamIsCapturing");
        let Ok(stream_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `cudaStreamCaptureStatus` slot inside the argument buffer — confirm
        // against `Argument`.
        let capture_status_out =
            match unsafe { args[1].downcast_mut::<runtime::cudaStreamCaptureStatus>() } {
                Ok(slot) => slot,
                Err(_) => {
                    return Err(ServerErr::InvalidType(
                        "InvalidType, <pCaptureStatus> expected: *mut runtime::cudaStreamCaptureStatus".into(),
                    ))
                }
            };
        // SAFETY: `capture_status_out` is the reference obtained above; the
        // stream handle comes straight from the request and is not validated.
        let status = unsafe { runtime::cudaStreamIsCapturing(stream, capture_status_out) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaStreamIsCapturing"),
                status as i32,
            ));
        }
        debug!("--------------cudaStreamIsCapturing, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventCreateWithFlags` requests.
pub struct CudaEventCreateWithFlagsHandler;
impl ApiHandler for CudaEventCreateWithFlagsHandler {
    /// Reads `args[0]` as a mutable `usize` slot whose address is handed to the
    /// runtime as the `cudaEvent_t` out-pointer and `args[1]` (`flags`) as
    /// `c_uint`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventCreateWithFlags");
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `usize` slot inside the argument buffer — confirm against `Argument`.
        let event_out = match unsafe { args[0].downcast_mut::<usize>() } {
            Ok(slot) => slot as *mut usize as *mut runtime::cudaEvent_t,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <event> expected: usize".into(),
                ))
            }
        };
        let Ok(create_flags) = args[1].downcast::<c_uint>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <flags> expected: c_uint".into(),
            ));
        };
        // SAFETY: `event_out` points at the slot borrowed from `args[0]` above.
        let status = unsafe { runtime::cudaEventCreateWithFlags(event_out, create_flags) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventCreateWithFlags"),
                status as i32,
            ));
        }
        debug!("--------------cudaEventCreateWithFlags, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventRecord` requests.
pub struct CudaEventRecordHandler;
impl ApiHandler for CudaEventRecordHandler {
    /// Reads `args[0]` (`event`) and `args[1]` (`stream`) as `usize` values
    /// that are cast to the CUDA handle types, calls the runtime and returns
    /// the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventRecord");
        let Ok(event_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        let Ok(stream_bits) = args[1].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        // SAFETY: FFI call; the handle values come straight from the request
        // and are not validated here.
        let status = unsafe { runtime::cudaEventRecord(event, stream) };
        // The handles are logged after the call, before the status check.
        debug!("event: {}, stream: {}", event, stream);
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventRecord"),
                status as i32,
            ));
        }

        debug!("--------------cudaEventRecord, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventRecordWithFlags` requests.
pub struct CudaEventRecordWithFlagsHandler;
impl ApiHandler for CudaEventRecordWithFlagsHandler {
    /// Reads `args[0]` (`event`) and `args[1]` (`stream`) as `usize` values
    /// that are cast to the CUDA handle types, and `args[2]` (`flags`) as
    /// `c_uint`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventRecordWithFlags");
        let Ok(event_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        let Ok(stream_bits) = args[1].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        let Ok(record_flags) = args[2].downcast::<c_uint>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <flags> expected: c_uint".into(),
            ));
        };

        // SAFETY: FFI call; the handle values come straight from the request
        // and are not validated here.
        let status = unsafe { runtime::cudaEventRecordWithFlags(event, stream, record_flags) };
        // The handles are logged after the call, before the status check.
        debug!("event: {}, stream: {}", event, stream);
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventRecordWithFlags"),
                status as i32,
            ));
        }

        debug!("--------------cudaEventRecordWithFlags, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventQuery` requests.
pub struct CudaEventQueryHandler;
impl ApiHandler for CudaEventQueryHandler {
    /// Reads `args[0]` (`event`) as a `usize` that is cast to
    /// `runtime::cudaEvent_t`, queries it and returns the status as an
    /// `ARG_OUT` argument. `runtime::cudaErrorNotReady` is logged as a warning
    /// and returned as a normal result rather than as an error.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero and is not
    /// `runtime::cudaErrorNotReady`.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventQuery");
        let Ok(event_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        // SAFETY: FFI call; the handle value comes straight from the request
        // and is not validated here.
        let status = unsafe { runtime::cudaEventQuery(event) };
        debug!("--------------cudaEventQuery, res={}", status);

        let not_ready = status == runtime::cudaErrorNotReady;
        if status > 0 && !not_ready {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventQuery"),
                status as i32,
            ));
        }

        if not_ready {
            // From the CUDA documentation:
            //   pub const cudaErrorNotReady: cudaError = 600;
            //   "This indicates that asynchronous operations issued previously
            //   have not completed yet. This result is not actually an error,
            //   but must be indicated differently than ::cudaSuccess (which
            //   indicates completion). Calls that may return this value include
            //   ::cudaEventQuery() and ::cudaStreamQuery()."
            warn!("cudaEventQuery, res = cudaErrorNotReady");
        }
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventSynchronize` requests.
pub struct CudaEventSynchronizeHandler;
impl ApiHandler for CudaEventSynchronizeHandler {
    /// Reads `args[0]` (`event`) as a `usize` that is cast to
    /// `runtime::cudaEvent_t`, calls the runtime and returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventSynchronize");
        let Ok(event_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        // SAFETY: FFI call; the handle value comes straight from the request
        // and is not validated here.
        let status = unsafe { runtime::cudaEventSynchronize(event) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventSynchronize"),
                status as i32,
            ));
        }

        debug!("--------------cudaEventSynchronize, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventDestroy` requests.
pub struct CudaEventDestroyHandler;
impl ApiHandler for CudaEventDestroyHandler {
    /// Reads `args[0]` (`event`) as a `usize` that is cast to
    /// `runtime::cudaEvent_t`, calls the runtime and returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventDestroy");
        let Ok(event_bits) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <event> expected: usize".into(),
            ));
        };
        let event = event_bits as runtime::cudaEvent_t;
        // SAFETY: FFI call; the handle value comes straight from the request
        // and is not validated here.
        let status = unsafe { runtime::cudaEventDestroy(event) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventDestroy"),
                status as i32,
            ));
        }

        debug!("--------------cudaEventDestroy, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaEventElapsedTime` requests.
pub struct CudaEventElapsedTimeHandler;
impl ApiHandler for CudaEventElapsedTimeHandler {
    /// Reads `args[0]` (`ms`) as a mutable `f32` slot, and `args[1]` (`start`)
    /// and `args[2]` (`end`) as `usize` values that are cast to
    /// `runtime::cudaEvent_t`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaEventElapsedTime");
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `f32` slot inside the argument buffer — confirm against `Argument`.
        let elapsed_out = match unsafe { args[0].downcast_mut::<f32>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <ms> expected: *mut f32".into(),
                ))
            }
        };
        let Ok(start_bits) = args[1].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <start> expected: usize".into(),
            ));
        };
        let start_event = start_bits as runtime::cudaEvent_t;
        let Ok(end_bits) = args[2].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <end> expected: usize".into(),
            ));
        };
        let end_event = end_bits as runtime::cudaEvent_t;
        // SAFETY: `elapsed_out` is the reference obtained above; the event
        // handles come straight from the request and are not validated here.
        let status = unsafe { runtime::cudaEventElapsedTime(elapsed_out, start_event, end_event) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaEventElapsedTime"),
                status as i32,
            ));
        }

        debug!("--------------cudaEventElapsedTime, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaMemset` requests.
pub struct CudaMemsetHandler;
impl ApiHandler for CudaMemsetHandler {
    /// Reads `args[0]` (`dev_ptr`) as a `usize` that is cast to `*mut c_void`,
    /// `args[1]` (`value`) as `c_int` and `args[2]` (`count`) as `usize`.
    /// Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let Ok(dev_addr) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <dev_ptr> expected: usize".into(),
            ));
        };
        let dev_ptr = dev_addr as *mut c_void;
        let Ok(fill_value) = args[1].downcast_ref::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <value> expected: c_int".into(),
            ));
        };
        let Ok(byte_count) = args[2].downcast_ref::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <count> expected: usize".into(),
            ));
        };

        // SAFETY: FFI call; the pointer value comes straight from the request
        // and is not validated here.
        let status = unsafe { runtime::cudaMemset(dev_ptr, *fill_value, *byte_count) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(String::from("cudaMemset"), status as i32));
        }
        debug!("----------cudaMemset, res: {}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaMemsetAsync` requests.
pub struct CudaMemsetAsyncHandler;
impl ApiHandler for CudaMemsetAsyncHandler {
    /// Reads `args[0]` (`devPtr`) as a `usize` that is cast to `*mut c_void`,
    /// `args[1]` (`value`) as `c_int`, `args[2]` (`count`) as `usize` and
    /// `args[3]` (`stream`) as a `usize` that is cast to
    /// `runtime::cudaStream_t`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaMemsetAsync");
        let Ok(dev_addr) = args[0].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <devPtr> expected: usize".into(),
            ));
        };
        let dev_ptr = dev_addr as *mut c_void;
        let Ok(fill_value) = args[1].downcast::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <value> expected: c_int".into(),
            ));
        };
        let Ok(byte_count) = args[2].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <count> expected: usize".into(),
            ));
        };
        let Ok(stream_bits) = args[3].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <stream> expected: usize".into(),
            ));
        };
        let stream = stream_bits as runtime::cudaStream_t;
        // SAFETY: FFI call; the pointer and stream values come straight from
        // the request and are not validated here.
        let status = unsafe { runtime::cudaMemsetAsync(dev_ptr, fill_value, byte_count, stream) };
        // The device pointer is logged after the call, before the status check.
        debug!("dev_ptr_u64:{:p}", dev_ptr);
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaMemsetAsync"),
                status as i32,
            ));
        }

        debug!("--------------cudaMemsetAsync, res={}", status);

        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cudaPointerGetAttributes` requests.
pub struct CudaPointerGetAttributesHandler;
impl ApiHandler for CudaPointerGetAttributesHandler {
    /// Reads `args[0]` (`attributes`) as a mutable
    /// `runtime::cudaPointerAttributes` and `args[1]` (`ptr`) as a `usize` that
    /// is cast to `*const c_void`. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `cudaPointerAttributes` slot inside the argument buffer — confirm
        // against `Argument`.
        let attributes_out =
            match unsafe { args[0].downcast_mut::<runtime::cudaPointerAttributes>() } {
                Ok(slot) => slot,
                Err(_) => {
                    return Err(ServerErr::InvalidType(
                        "InvalidType, <attributes> expected: runtime::cudaPointerAttributes".into(),
                    ))
                }
            };
        let Ok(queried_addr) = args[1].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <ptr> expected: usize".into(),
            ));
        };
        let queried_ptr = queried_addr as *const c_void;
        // SAFETY: the attributes pointer is derived from the reference obtained
        // above; the queried pointer comes straight from the request.
        let status = unsafe {
            runtime::cudaPointerGetAttributes(
                attributes_out as *mut runtime::cudaPointerAttributes,
                queried_ptr,
            )
        };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cudaPointerGetAttributes"),
                status as i32,
            ));
        }
        debug!("----------cudaPointerGetAttributes, res: {}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cuDeviceGet` requests.
pub struct CuDeviceGetHandler;
impl ApiHandler for CuDeviceGetHandler {
    /// Reads `args[0]` (`device`) as a mutable `driver::CUdevice` slot and
    /// `args[1]` (`ordinal`) as `c_int`, then calls the driver API. Returns the
    /// status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `CUdevice` slot inside the argument buffer — confirm against
        // `Argument`.
        let device_out = match unsafe { args[0].downcast_mut::<driver::CUdevice>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <device> expected: driver::CUdevice".into(),
                ))
            }
        };
        let Ok(device_ordinal) = args[1].downcast_ref::<c_int>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <ordinal> expected: c_int".into(),
            ));
        };
        // SAFETY: the pointer is derived from the reference obtained above.
        let status = unsafe {
            driver::cuDeviceGet(device_out as *mut driver::CUdevice, *device_ordinal)
        };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cuDeviceGet"),
                status as i32,
            ));
        }

        debug!("----------cuDeviceGet res: {}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cuDeviceGetAttribute` requests.
pub struct CuDeviceGetAttributeHandler;
impl ApiHandler for CuDeviceGetAttributeHandler {
    /// Reads `args[0]` (`pi`) as a mutable `c_int` slot, `args[1]` (`attrib`)
    /// as `driver::CUdevice_attribute` and `args[2]` (`dev`) as
    /// `driver::CUdevice`, then calls the driver API. Returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let value_out = match unsafe { args[0].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <pi> expected: c_int".into(),
                ))
            }
        };
        let Ok(attribute) = args[1].downcast_ref::<driver::CUdevice_attribute>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <attrib> expected: driver::CUdevice_attribute".into(),
            ));
        };
        let Ok(device) = args[2].downcast_ref::<driver::CUdevice>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <dev> expected: driver::CUdevice".into(),
            ));
        };
        // SAFETY: the pointer is derived from the reference obtained above.
        let status = unsafe {
            driver::cuDeviceGetAttribute(value_out as *mut c_int, *attribute, *device)
        };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cuDeviceGetAttribute"),
                status as i32,
            ));
        }
        debug!("----------cuDeviceGetAttribute: {}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cuDevicePrimaryCtxGetState` requests.
pub struct CuDevicePrimaryCtxGetStateHandler;
impl ApiHandler for CuDevicePrimaryCtxGetStateHandler {
    /// Reads `args[0]` (`dev`) as `driver::CUdevice`, and `args[1]` (`flags`)
    /// and `args[2]` (`active`) as mutable `c_uint` / `c_int` slots passed by
    /// reference to the driver API. The argument values are logged before and
    /// after the call. Returns the status as an `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let Ok(device) = args[0].downcast_ref::<driver::CUdevice>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <dev> expected: driver::CUdevice".into(),
            ));
        };

        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_uint` slot inside the argument buffer — confirm against `Argument`.
        let flags_out = match unsafe { args[1].downcast_mut::<c_uint>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <flags> expected: c_uint".into(),
                ))
            }
        };
        // SAFETY: same assumption as for `flags_out`, for a `c_int` slot.
        let active_out = match unsafe { args[2].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <active> expected: c_int".into(),
                ))
            }
        };
        debug!(
            "args: dev:{}, flags:{}, active:{}",
            device, *flags_out, *active_out
        );
        // SAFETY: both out-references were obtained above.
        let status = unsafe { driver::cuDevicePrimaryCtxGetState(*device, flags_out, active_out) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cuDevicePrimaryCtxGetState"),
                status as i32,
            ));
        }
        debug!(
            "args: dev:{}, flags:{}, active:{}, res:{}",
            device, *flags_out, *active_out, status
        );

        debug!("----------cuDevicePrimaryCtxGetState: {}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cuCtxGetCurrent` requests.
pub struct CuCtxGetCurrentHandler;
impl ApiHandler for CuCtxGetCurrentHandler {
    /// Reads `args[0]` (`pctx`) as a mutable `driver::CUcontext` slot, passes
    /// it by reference to the driver API and returns the status as an `ARG_OUT`
    /// argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `driver::CUcontext`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cuCtxGetCurrent");
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `CUcontext` slot inside the argument buffer — confirm against
        // `Argument`.
        let context_out = match unsafe { args[0].downcast_mut::<driver::CUcontext>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <pctx> expected: *mut driver::CUcontext".into(),
                ))
            }
        };
        // SAFETY: `context_out` is the reference obtained above.
        let status = unsafe { driver::cuCtxGetCurrent(context_out) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cuCtxGetCurrent"),
                status as i32,
            ));
        }

        debug!("--------------cuCtxGetCurrent, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cuFuncGetAttribute` requests.
pub struct CuFuncGetAttributeHandler;
impl ApiHandler for CuFuncGetAttributeHandler {
    /// Reads `args[0]` (`pi`) as a mutable `c_int` slot, `args[1]` (`attrib`)
    /// as `driver::CUfunction_attribute` and `args[2]` (`hfunc`) as a `usize`
    /// that is cast to `driver::CUfunction`. Returns the status as an `ARG_OUT`
    /// argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when an argument has the wrong type;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cuFuncGetAttribute");
        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `c_int` slot inside the argument buffer — confirm against `Argument`.
        let value_out = match unsafe { args[0].downcast_mut::<c_int>() } {
            Ok(slot) => slot,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <pi> expected: *mut c_int".into(),
                ))
            }
        };
        let Ok(attribute) = args[1].downcast::<driver::CUfunction_attribute>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <attrib> expected: driver::CUfunction_attribute".into(),
            ));
        };
        let Ok(func_bits) = args[2].downcast::<usize>() else {
            return Err(ServerErr::InvalidType(
                "InvalidType, <hfunc> expected: usize".into(),
            ));
        };
        let function = func_bits as driver::CUfunction;
        // SAFETY: `value_out` is the reference obtained above; the function
        // handle comes straight from the request and is not validated here.
        let status = unsafe { driver::cuFuncGetAttribute(value_out, attribute, function) };
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cuFuncGetAttribute"),
                status as i32,
            ));
        }

        debug!("--------------cuFuncGetAttribute, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Serves `cublasCreate_v2` requests.
pub struct CublasCreateV2handler;
impl ApiHandler for CublasCreateV2handler {
    /// Reads `args[0]` as a mutable `usize` slot whose address is handed to
    /// cuBLAS as the `cublasHandle_t` out-pointer. The pointer and the value it
    /// points at are logged before and after the call. Returns the status as an
    /// `ARG_OUT` argument.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` when `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` when the status is greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasCreate_v2");

        // SAFETY: NOTE(review): relies on `downcast_mut` handing out a valid
        // `usize` slot inside the argument buffer — confirm against `Argument`.
        let handle_out = match unsafe { args[0].downcast_mut::<usize>() } {
            Ok(slot) => slot as *mut usize as *mut cublas::cublasHandle_t,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <handle> expected: usize".into(),
                ))
            }
        };
        // The dereference stays inside the macro arguments, as in the original
        // logging statement.
        debug!("server: handle ptr:{:p}, *ptr:{:?} ", handle_out, unsafe {
            *handle_out
        });
        // SAFETY: `handle_out` points at the slot borrowed from `args[0]` above.
        let status = unsafe { cublas::cublasCreate_v2(handle_out) };
        debug!("server: handle ptr:{:p}, *ptr:{:?} ", handle_out, unsafe {
            *handle_out
        });
        if status > 0 {
            return Err(ServerErr::ApiRunError(
                String::from("cublasCreate_v2"),
                status as i32,
            ));
        }
        debug!("----------cublasCreate_v2, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasSetWorkspace_v2` requests.
pub struct CublasSetWorkspaceV2handler;
impl ApiHandler for CublasSetWorkspaceV2handler {
    /// Decodes `args` as `[handle, workspace, workspaceSizeInBytes]` (all `usize`) and calls
    /// `cublas::cublasSetWorkspace_v2`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasSetWorkspace_v2");

        // Shared constructor for the "unexpected argument type" error.
        let invalid = |msg: &'static str| ServerErr::InvalidType(msg.into());

        // Handle and workspace arrive as raw addresses and are cast back to their FFI types.
        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <handle> expected: usize"))?;
        let workspace_addr = args[1]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <workspace> expected: usize"))?;
        let workspace_len = args[2]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <workspaceSizeInBytes> expected: usize"))?;

        let handle = handle_addr as cublas::cublasHandle_t;
        let workspace_ptr = workspace_addr as *mut c_void;
        debug!("workspace:{:p}, size:{}", workspace_ptr, workspace_len);

        let status = unsafe { cublas::cublasSetWorkspace_v2(handle, workspace_ptr, workspace_len) };
        debug!("--------------cublasSetWorkspace_v2, res={}", status);
        if status > 0 {
            let api_name = "cublasSetWorkspace_v2".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasSetStream_v2` requests.
pub struct CublasSetStreamV2handler;
impl ApiHandler for CublasSetStreamV2handler {
    /// Decodes `args` as `[handle, streamId]` (both `usize`) and calls
    /// `cublas::cublasSetStream_v2`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasSetStream_v2");

        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <handle> expected: usize".into()))?;
        let stream_addr = args[1].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <streamId> expected: usize".into())
        })?;

        // Both values are addresses that are cast back to the FFI handle types.
        let cublas_handle = handle_addr as cublas::cublasHandle_t;
        let stream = stream_addr as cublas::cudaStream_t;
        debug!(" server handle ptr:{:?}", cublas_handle);
        debug!(" server stream_id ptr:{:?}", stream);

        let status = unsafe { cublas::cublasSetStream_v2(cublas_handle, stream) };
        debug!(" server stream_id ptr:{:?}", stream);
        debug!("--------------cublasSetStream_v2, res={}", status);

        if status > 0 {
            let api_name = "cublasSetStream_v2".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasGetStream_v2` requests.
pub struct CublasGetStreamV2handler;
impl ApiHandler for CublasGetStreamV2handler {
    /// Decodes `args` as `[handle, streamId]` and calls `cublas::cublasGetStream_v2`, which
    /// writes the stream into the `usize` slot behind `args[1]`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasGetStream_v2");

        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <handle> expected: usize".into()))?;
        let cublas_handle = handle_addr as cublas::cublasHandle_t;

        // SAFETY: assumes args[1] holds a writable `usize` — TODO confirm with the IPC layer.
        let stream_slot = unsafe { args[1].downcast_mut::<usize>() }.map_err(|_| {
            ServerErr::InvalidType("InvalidType, <stream_id> expected: usize".into())
        })?;
        // The usize slot is reused as the out-parameter for the stream handle.
        let stream_out = stream_slot as *mut usize as *mut cublas::cudaStream_t;

        debug!("server: handle:{:?} ", cublas_handle);
        debug!(
            "server1: stream_id ptr:{:p}, *ptr:{:?} ",
            stream_out,
            unsafe { *stream_out }
        );

        let status = unsafe { cublas::cublasGetStream_v2(cublas_handle, stream_out) };
        debug!(
            "server2: stream_id ptr:{:p}, *ptr:{:?} ",
            stream_out,
            unsafe { *stream_out }
        );

        if status > 0 {
            let api_name = "cublasGetStream_v2".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("----------cublasGetStream_v2, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasLtMatmulPreferenceCreate` requests.
pub struct CublasLtMatmulPreferenceCreateHandler;
impl ApiHandler for CublasLtMatmulPreferenceCreateHandler {
    /// Uses the `usize` slot in `args[0]` as the output location for the new preference
    /// handle and calls `cublaslt::cublasLtMatmulPreferenceCreate`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when `args[0]` is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulPreferenceCreate");

        // SAFETY: assumes args[0] holds a writable `usize` — TODO confirm with the IPC layer.
        let slot = unsafe { args[0].downcast_mut::<usize>() }
            .map_err(|_| ServerErr::InvalidType("InvalidType, <pref> expected: usize".into()))?;
        let pref_out = slot as *mut usize as *mut cublaslt::cublasLtMatmulPreference_t;

        let status = unsafe { cublaslt::cublasLtMatmulPreferenceCreate(pref_out) };
        if status > 0 {
            let api_name = "cublasLtMatmulPreferenceCreate".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("--------------cublasLtMatmulPreferenceCreate, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

pub struct CublasLtMatmulPreferenceSetAttributeHandler;
impl ApiHandler for CublasLtMatmulPreferenceSetAttributeHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulPreferenceSetAttribute");
        let pref = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <pref> expected: usize".into()))?
            as cublaslt::cublasLtMatmulPreference_t;

        let attr = args[1]
            .downcast::<cublaslt::cublasLtMatmulPreferenceAttributes_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <attr> expected: cublaslt::cublasLtMatmulPreferenceAttributes_t"
                        .into(),
                )
            })?;

        let buf = args[2].downcast_slice::<u8>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <buf> expected: *const c_void".into())
        })?;

        let size_in_bytes = args[3]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <size> expected: usize".into()))?;

        let res = unsafe {
            cublaslt::cublasLtMatmulPreferenceSetAttribute(
                pref,
                attr,
                buf.as_ptr().cast::<c_void>(),
                size_in_bytes,
            )
        };
        debug!(
            "--------------cublasLtMatmulPreferenceSetAttribute, res:{}",
            res
        );
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasLtMatmulPreferenceSetAttribute".to_string(),
                res as i32,
            ));
        }
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler that services `cublasLtMatmulDescCreate` requests.
pub struct CublasLtMatmulDescCreateHandler;
impl ApiHandler for CublasLtMatmulDescCreateHandler {
    /// Decodes `args` as `[matmulDesc, computeType, scaleType]` and calls
    /// `cublaslt::cublasLtMatmulDescCreate`, which writes the new descriptor handle into the
    /// `usize` slot behind `args[0]`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulDescCreate");
        // SAFETY: assumes args[0] holds a writable `usize` — TODO confirm with the IPC layer.
        let matmul_desc = unsafe {
            args[0].downcast_mut::<usize>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <matmul_desc> expected: usize".into())
            })? as *mut usize as *mut cublaslt::cublasLtMatmulDesc_t
        };
        let compute_type = args[1]
            .downcast::<cublaslt::cublasComputeType_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <computeType> expected: cublaslt::cublasComputeType_t".into(),
                )
            })?;
        let scale_type = args[2]
            .downcast::<cublaslt::cudaDataType_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <scaleType> expected: cublaslt::cudaDataType_t".into(),
                )
            })?;
        // Before the call only the slot itself is read, never what it points to.
        debug!("matmul_desc:{:p}, **{:?}", matmul_desc, unsafe {
            *matmul_desc
        });
        let res =
            unsafe { cublaslt::cublasLtMatmulDescCreate(matmul_desc, compute_type, scale_type) };
        debug!("--------------cublasLtMatmulDescCreate, res={}", res);
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasLtMatmulDescCreate".to_string(),
                res as i32,
            ));
        }
        // Look through the new handle only after creation has succeeded; on failure the slot
        // may still hold a null or stale value and following it would crash the server.
        debug!(
            "matmul_desc:{:p}, *{:?}, **{:?}",
            matmul_desc,
            unsafe { *matmul_desc },
            unsafe { **matmul_desc }
        );
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

pub struct CublasLtMatmulDescSetAttributeHandler;
impl ApiHandler for CublasLtMatmulDescSetAttributeHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulDescSetAttribute");
        let matmul_desc = args[0].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <matmul_desc> expected: usize".into())
        })? as cublaslt::cublasLtMatmulDesc_t;

        let attr = args[1]
            .downcast::<cublaslt::cublasLtMatmulDescAttributes_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <attr> expected: cublaslt::cublasLtMatmulDescAttributes_t".into(),
                )
            })?;

        let buf: *const c_void = {
            if let Ok(slice) = args[2].downcast_slice::<u8>() {
                debug!("buf as slice, len:{}", slice.len());
                slice.as_ptr().cast::<c_void>()
            } else if let Ok(addr) = args[2].downcast_ref::<usize>() {
                debug!("buf as usize, {:x}", addr);
                addr as *const usize as *const c_void
            } else {
                error!("InvalidType, <buf> expected: &[u8] (host buffer) or usize (device virtual address)");
                return Err(ServerErr::InvalidType(
                    "InvalidType, <buf> expected: &[u8] (host buffer) or usize (device virtual address)".into()
                ));
            }
        };

        let size = args[3]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <size> expected: usize".into()))?;

        debug!("matmul_desc:{:p}, **{:?}", matmul_desc, unsafe {
            *matmul_desc
        });

        let res = unsafe { cublaslt::cublasLtMatmulDescSetAttribute(matmul_desc, attr, buf, size) };
        debug!("--------------cublasLtMatmulDescSetAttribute, res={}", res);
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasLtMatmulDescSetAttribute".to_string(),
                res as i32,
            ));
        }
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler that services `cublasLtMatrixLayoutCreate` requests.
pub struct CublasLtMatrixLayoutCreateHandler;
impl ApiHandler for CublasLtMatrixLayoutCreateHandler {
    /// Decodes `args` as `[matLayout, type, rows, cols, ld]` and calls
    /// `cublaslt::cublasLtMatrixLayoutCreate`, which writes the new layout handle into the
    /// `usize` slot behind `args[0]`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatrixLayoutCreate");
        // SAFETY: assumes args[0] holds a writable `usize` — TODO confirm with the IPC layer.
        let mat_layout = unsafe {
            args[0].downcast_mut::<usize>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <mat_layout> expected: usize".into())
            })? as *mut usize as *mut cublaslt::cublasLtMatrixLayout_t
        };
        let type_x = args[1].downcast::<cublaslt::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <type_x> expected: cublaslt::cudaDataType".into())
        })?;
        let rows = args[2]
            .downcast::<u64>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <rows> expected: u64".into()))?;
        let cols = args[3]
            .downcast::<u64>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <cols> expected: u64".into()))?;
        let ld = args[4]
            .downcast::<i64>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ld> expected: i64".into()))?;
        let res =
            unsafe { cublaslt::cublasLtMatrixLayoutCreate(mat_layout, type_x, rows, cols, ld) };
        debug!("rows:{},cols:{},ld:{}", rows, cols, ld);
        debug!("--------------cublasLtMatrixLayoutCreate, res={}", res);
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasLtMatrixLayoutCreate".to_string(),
                res as i32,
            ));
        }
        // Look through the new handle only after creation has succeeded; on failure the slot
        // may still hold a null or stale value and following it would crash the server.
        debug!(
            "matlayout, ptr:{:p}, *{:?}, **{:?}",
            mat_layout,
            unsafe { *mat_layout },
            unsafe { **mat_layout }
        );
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler that services `cublasGetMathMode` requests.
pub struct CublasGetMathModeHandler;
impl ApiHandler for CublasGetMathModeHandler {
    /// Decodes `args` as `[handle, mode]` and calls `cublas::cublasGetMathMode`, which writes
    /// the current math mode into the slot behind `args[1]`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasGetMathMode");

        // Shared constructor for the "unexpected argument type" error.
        let invalid = |msg: &'static str| ServerErr::InvalidType(msg.into());

        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <handle> expected: usize"))?;
        let cublas_handle = handle_addr as cublas::cublasHandle_t;

        // SAFETY: assumes args[1] holds a writable `cublasMath_t` — TODO confirm with the IPC layer.
        let mode_out = unsafe { args[1].downcast_mut::<cublas::cublasMath_t>() }
            .map_err(|_| invalid("InvalidType, <mode> expected: *mut cublas::cublasMath_t"))?;

        let status = unsafe { cublas::cublasGetMathMode(cublas_handle, mode_out) };
        if status > 0 {
            let api_name = "cublasGetMathMode".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("--------------cublasGetMathMode, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasSetMathMode` requests.
pub struct CublasSetMathModeHandler;
impl ApiHandler for CublasSetMathModeHandler {
    /// Decodes `args` as `[handle, mode]` and calls `cublas::cublasSetMathMode`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasSetMathMode");

        // Shared constructor for the "unexpected argument type" error.
        let invalid = |msg: &'static str| ServerErr::InvalidType(msg.into());

        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <handle> expected: usize"))?;
        let math_mode = args[1]
            .downcast::<cublas::cublasMath_t>()
            .map_err(|_| invalid("InvalidType, <mode> expected: cublas::cublasMath_t"))?;

        let cublas_handle = handle_addr as cublas::cublasHandle_t;
        let status = unsafe { cublas::cublasSetMathMode(cublas_handle, math_mode) };
        debug!("--------------cublasSetMathMode, res={}", status);

        if status > 0 {
            let api_name = "cublasSetMathMode".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasLtMatmulAlgoGetHeuristic` requests.
pub struct CublasLtMatmulAlgoGetHeuristicHandler;
impl ApiHandler for CublasLtMatmulAlgoGetHeuristicHandler {
    /// Decodes the ten call arguments and invokes `cublaslt::cublasLtMatmulAlgoGetHeuristic`.
    ///
    /// Argument layout: `[lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference,
    /// requestedAlgoCount, heuristicResultsArray, returnAlgoCount]`. Handles and descriptors
    /// travel as `usize` addresses; the last two entries are output slots.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulAlgoGetHeuristic");
        let light_handle = args[0].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <lightHandle> expected: usize".into())
        })? as cublaslt::cublasLtHandle_t;
        let operation_desc = args[1].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <operationDesc> expected: usize".into())
        })? as cublaslt::cublasLtMatmulDesc_t;
        let adesc = args[2]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Adesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let bdesc = args[3]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Bdesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let cdesc = args[4]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Cdesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let ddesc = args[5]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Ddesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let preference = args[6].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <preference> expected: usize".into())
        })? as cublaslt::cublasLtMatmulPreference_t;
        let requested_algo_count = args[7].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <requestedAlgoCount> expected: c_int".into())
        })?;
        // NOTE(review): a single `cublasLtMatmulHeuristicResult_t` is downcast here, while the
        // library may write up to `requested_algo_count` entries — confirm that the buffer
        // behind args[8] is sized accordingly by the IPC layer.
        let heuristic_results_array = unsafe {
            args[8]
            .downcast_mut::<cublaslt::cublasLtMatmulHeuristicResult_t>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <heuristicResultsArray> expected: *mut cublaslt::cublasLtMatmulHeuristicResult_t".into()))?
        };
        let return_algo_count = unsafe {
            args[9].downcast_mut::<c_int>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <returnAlgoCount> expected: *mut c_int".into())
            })?
        };
        let res = unsafe {
            cublaslt::cublasLtMatmulAlgoGetHeuristic(
                light_handle,
                operation_desc,
                adesc,
                bdesc,
                cdesc,
                ddesc,
                preference,
                requested_algo_count,
                heuristic_results_array as *mut cublaslt::cublasLtMatmulHeuristicResult_t,
                return_algo_count,
            )
        };
        debug!("--------------cublasLtMatmulAlgoGetHeuristic, res={}", res);
        if res > 0 {
            debug!("light_handle: {}", light_handle);
            // These handles are client-supplied and a null one is a plausible cause of the
            // failure being reported, so use `as_ref` (which yields `None` for null) instead
            // of dereferencing them unconditionally.
            debug!("op_desc:{:p}, *{:?}", operation_desc, unsafe {
                operation_desc.as_ref()
            });
            debug!("b_desc:{:p}, *{:?}", bdesc, unsafe { bdesc.as_ref() });
            debug!("preference:{:p}, *{:?}", preference, unsafe {
                preference.as_ref()
            });
            debug!(
                "heuristic_res_array:{:p}, *{:?}",
                heuristic_results_array, *heuristic_results_array
            );
            debug!(
                "ret_alg_cnt:{:p}, *{:?}",
                return_algo_count, *return_algo_count
            );
            return Err(ServerErr::ApiRunError(
                "cublasLtMatmulAlgoGetHeuristic".to_string(),
                res as i32,
            ));
        }
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler that services `cublasLtMatmul` requests.
pub struct CublasLtMatmulHandler;
impl ApiHandler for CublasLtMatmulHandler {
    /// Decodes the sixteen call arguments and invokes `cublaslt::cublasLtMatmul`.
    ///
    /// Argument layout: `[lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc,
    /// D, Ddesc, algo, workspace, workspaceSizeInBytes, stream]`. Handles, descriptors and
    /// matrix pointers travel as `usize` addresses. `alpha` and `beta` may each be a host
    /// byte slice or a `usize` address; `beta` may additionally be empty, in which case a
    /// null pointer is passed.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to an accepted type.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmul");
        // Decodes the leading four bytes of a host scalar as `f32`, for logging only.
        // NOTE(review): the real scalar type depends on the matmul descriptor; `f32` is just
        // the interpretation the log has always used.
        let preview_f32 = |bytes: &[u8]| match *bytes {
            [b0, b1, b2, b3, ..] => Some(f32::from_ne_bytes([b0, b1, b2, b3])),
            _ => None,
        };
        let light_handle = args[0].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <lightHandle> expected: usize".into())
        })? as cublaslt::cublasLtHandle_t;
        let compute_desc = args[1].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <computeDesc> expected: usize".into())
        })? as cublaslt::cublasLtMatmulDesc_t;
        // `alpha_host` is only populated when the scalar lives in host memory; an address
        // received as `usize` must never be dereferenced on the host side.
        let (alpha, alpha_host): (*const c_void, Option<f32>) = {
            if let Ok(slice) = args[2].downcast_slice::<u8>() {
                debug!("alpha as slice, len:{}", slice.len());
                (slice.as_ptr().cast::<c_void>(), preview_f32(slice))
            } else if let Ok(addr) = args[2].downcast::<usize>() {
                debug!("alpha as usize, {:x}", addr);
                (addr as *const c_void, None)
            } else {
                error!("InvalidType, <alpha> expected: &[u8] (host buffer) or usize (device virtual address)");
                return Err(ServerErr::InvalidType(
                    "InvalidType, <alpha> expected: &[u8] (host buffer) or usize (device virtual address)".into()
                ));
            }
        };
        let a = args[3]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <A> expected: usize".into()))?
            as *const c_void;
        let adesc = args[4]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Adesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let b = args[5]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <B> expected: usize".into()))?
            as *const c_void;
        let bdesc = args[6]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Bdesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        // Same host/device distinction as for alpha, plus the "empty => null" case.
        let (beta, beta_host): (*const c_void, Option<f32>) = {
            if args[7].is_empty() {
                (std::ptr::null(), None)
            } else if let Ok(slice) = args[7].downcast_slice::<u8>() {
                debug!("beta as slice, len:{}", slice.len());
                (slice.as_ptr().cast::<c_void>(), preview_f32(slice))
            } else if let Ok(addr) = args[7].downcast::<usize>() {
                debug!("beta as usize, {:x}", addr);
                (addr as *const c_void, None)
            } else {
                error!("InvalidType, <beta> expected: &[u8] (host buffer) or usize (device virtual address)");
                return Err(ServerErr::InvalidType(
                        "InvalidType, <beta> expected: &[u8] (host buffer) or usize (device virtual address)".into()
                    ));
            }
        };
        let c = args[8]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <C> expected: usize".into()))?
            as *const c_void;
        let cdesc = args[9]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Cdesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let d = args[10]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <D> expected: usize".into()))?
            as *mut c_void;
        let ddesc = args[11]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <Ddesc> expected: usize".into()))?
            as cublaslt::cublasLtMatrixLayout_t;
        let algo = unsafe {
            args[12]
                .downcast_mut::<cublaslt::cublasLtMatmulAlgo_t>()
                .map_err(|_| {
                    ServerErr::InvalidType(
                        "InvalidType, <algo> expected: *const cublaslt::cublasLtMatmulAlgo_t"
                            .into(),
                    )
                })?
        };
        let workspace = args[13].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <workspace> expected: usize".into())
        })? as *mut c_void;
        let workspace_size_in_bytes = args[14].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <workspaceSizeInBytes> expected: usize".into())
        })?;
        let stream = args[15]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <stream> expected: usize".into()))?
            as cublaslt::cudaStream_t;
        // Log the decoded host value (if any) rather than reading through the raw pointer,
        // which may be null or not a host address at all.
        debug!("alpha:{:p}, *f32 {:?}", alpha, alpha_host);
        debug!("beta:{:p}, *f32 {:?}", beta, beta_host);
        debug!("algo:{:p}, *{:?}", algo, *algo);
        debug!(
            "workspace:{:p}, size:{}",
            workspace, workspace_size_in_bytes
        );
        let res = unsafe {
            cublaslt::cublasLtMatmul(
                light_handle,
                compute_desc,
                alpha,
                a,
                adesc,
                b,
                bdesc,
                beta,
                c,
                cdesc,
                d,
                ddesc,
                algo,
                workspace,
                workspace_size_in_bytes,
                stream,
            )
        };
        debug!("--------------cublasLtMatmul, res={}", res);
        if res > 0 {
            // The layout handles are client-supplied and a null one is a plausible cause of
            // the failure, so `as_ref` (which yields `None` for null) is used instead of an
            // unconditional dereference.
            debug!("a_desc:{:p}, *{:?}", adesc, unsafe { adesc.as_ref() });
            debug!("b_desc:{:p}, *{:?}", bdesc, unsafe { bdesc.as_ref() });
            debug!("c_desc:{:p}, *{:?}", cdesc, unsafe { cdesc.as_ref() });
            debug!("d_desc:{:p}, *{:?}", ddesc, unsafe { ddesc.as_ref() });
            return Err(ServerErr::ApiRunError(
                "cublasLtMatmul".to_string(),
                res as i32,
            ));
        }
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler that services `cublasLtMatmulDescDestroy` requests.
pub struct CublasLtMatmulDescDestroyHandler;
impl ApiHandler for CublasLtMatmulDescDestroyHandler {
    /// Reads the descriptor address from `args[0]` and calls
    /// `cublaslt::cublasLtMatmulDescDestroy` on it.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when `args[0]` is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulDescDestroy");

        let desc_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <matmulDesc> expected: usize".into()))?;
        let descriptor = desc_addr as cublaslt::cublasLtMatmulDesc_t;

        let status = unsafe { cublaslt::cublasLtMatmulDescDestroy(descriptor) };
        if status > 0 {
            let api_name = "cublasLtMatmulDescDestroy".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("--------------cublasLtMatmulDescDestroy, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasLtMatrixLayoutDestroy` requests.
pub struct CublasLtMatrixLayoutDestroyHandler;
impl ApiHandler for CublasLtMatrixLayoutDestroyHandler {
    /// Reads the layout address from `args[0]` and calls
    /// `cublaslt::cublasLtMatrixLayoutDestroy` on it.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when `args[0]` is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatrixLayoutDestroy");

        let layout_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <matLayout> expected: usize".into()))?;
        let layout = layout_addr as cublaslt::cublasLtMatrixLayout_t;

        let status = unsafe { cublaslt::cublasLtMatrixLayoutDestroy(layout) };
        if status > 0 {
            let api_name = "cublasLtMatrixLayoutDestroy".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("--------------cublasLtMatrixLayoutDestroy, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasLtMatmulPreferenceDestroy` requests.
pub struct CublasLtMatmulPreferenceDestroyHandler;
impl ApiHandler for CublasLtMatmulPreferenceDestroyHandler {
    /// Reads the preference address from `args[0]` and calls
    /// `cublaslt::cublasLtMatmulPreferenceDestroy` on it.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when `args[0]` is not a `usize`.
    /// * `ServerErr::ApiRunError` when cuBLASLt returns a status greater than zero.
    ///
    /// On success the cuBLASLt status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtMatmulPreferenceDestroy");

        let pref_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <pref> expected: usize".into()))?;
        let preference = pref_addr as cublaslt::cublasLtMatmulPreference_t;

        let status = unsafe { cublaslt::cublasLtMatmulPreferenceDestroy(preference) };
        if status > 0 {
            let api_name = "cublasLtMatmulPreferenceDestroy".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }

        debug!("--------------cublasLtMatmulPreferenceDestroy, res={}", status);
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `cublasSgemm_v2` requests.
pub struct CublasSgemmV2handler;
impl ApiHandler for CublasSgemmV2handler {
    /// Decodes the fourteen call arguments and invokes `cublas::cublasSgemm_v2`.
    ///
    /// Argument layout: `[handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C,
    /// ldc]`. The handle and the matrix pointers travel as `usize` addresses, while `alpha`
    /// and `beta` are downcast as `f32` values held in the argument list.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument cannot be downcast to the expected type.
    /// * `ServerErr::ApiRunError` when cuBLAS returns a status greater than zero.
    ///
    /// On success the cuBLAS status is returned wrapped as an `ARG_OUT` argument.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasSgemm_v2");

        // Shared constructor for the "unexpected argument type" error.
        let invalid = |msg: &'static str| ServerErr::InvalidType(msg.into());

        let handle_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <handle> expected: usize"))?;
        let cublas_handle = handle_addr as cublas::cublasHandle_t;

        let op_a = args[1]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| invalid("InvalidType, <transa> expected: cublas::cublasOperation_t"))?;
        let op_b = args[2]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| invalid("InvalidType, <transb> expected: cublas::cublasOperation_t"))?;

        // Problem dimensions.
        let rows = args[3]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <m> expected: c_int"))?;
        let cols = args[4]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <n> expected: c_int"))?;
        let inner = args[5]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <k> expected: c_int"))?;

        // SAFETY: assumes args[6] holds an `f32` — TODO confirm with the IPC layer.
        let alpha_ref = unsafe { args[6].downcast_mut::<f32>() }
            .map_err(|_| invalid("InvalidType, <alpha> expected: *const f32"))?;

        let a_addr = args[7]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <A> expected: usize"))?;
        let lead_a = args[8]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <lda> expected: c_int"))?;
        let b_addr = args[9]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <B> expected: usize"))?;
        let lead_b = args[10]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <ldb> expected: c_int"))?;

        // SAFETY: assumes args[11] holds an `f32` — TODO confirm with the IPC layer.
        let beta_ref = unsafe { args[11].downcast_mut::<f32>() }
            .map_err(|_| invalid("InvalidType, <beta> expected: *const f32"))?;

        let c_addr = args[12]
            .downcast::<usize>()
            .map_err(|_| invalid("InvalidType, <C> expected: usize"))?;
        let lead_c = args[13]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <ldc> expected: c_int"))?;

        // Matrix operands are addresses cast back to typed pointers.
        let mat_a = a_addr as *const f32;
        let mat_b = b_addr as *const f32;
        let mat_c = c_addr as *mut f32;

        let status = unsafe {
            cublas::cublasSgemm_v2(
                cublas_handle,
                op_a,
                op_b,
                rows,
                cols,
                inner,
                alpha_ref,
                mat_a,
                lead_a,
                mat_b,
                lead_b,
                beta_ref,
                mat_c,
                lead_c,
            )
        };
        debug!("--------------cublasSgemm_v2, res={}", status);
        if status > 0 {
            let api_name = "cublasSgemm_v2".to_string();
            return Err(ServerErr::ApiRunError(api_name, status as i32));
        }
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler that services `nvmlInit_v2` requests.
pub struct NvmlInitV2Handler;
impl ApiHandler for NvmlInitV2Handler {
    /// Calls `nvml::nvmlInit_v2` and returns its status wrapped as an `ARG_OUT` argument.
    ///
    /// The status is forwarded as-is; this handler never produces an `Err`.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let status = unsafe { nvml::nvmlInit_v2() };
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `nvmlShutdown`; it consumes no request arguments.
pub struct NvmlShutdownHandler;
impl ApiHandler for NvmlShutdownHandler {
    /// Calls `nvml::nvmlShutdown` and replies with the returned status as the
    /// output argument. The status is never turned into a `ServerErr` here.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let status = unsafe { nvml::nvmlShutdown() };
        Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `nvmlDeviceGetCount_v2`.
///
/// `args[0]` is the `c_uint` slot that receives the device count.
pub struct NvmlDeviceGetCountV2handler;
impl ApiHandler for NvmlDeviceGetCountV2handler {
    /// Passes the count slot to `nvml::nvmlDeviceGetCount_v2` and replies with
    /// the NVML status as the output argument.
    ///
    /// # Errors
    /// Returns `ServerErr::InvalidType` when `args[0]` is not a `c_uint`.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let device_count = unsafe {
            args[0].downcast_mut::<c_uint>().map_err(|_| {
                // The message names the argument actually being parsed (the
                // count out-slot), not a CUDA device handle.
                ServerErr::InvalidType(
                    "InvalidType, <device_count> expected: *mut c_uint".into(),
                )
            })?
        };
        let res = unsafe { nvml::nvmlDeviceGetCount_v2(device_count as *mut c_uint) };
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclGetVersion`.
///
/// `args[0]` is the `c_int` slot that NCCL fills with its version.
pub struct NcclGetVersionHandler;
impl ApiHandler for NcclGetVersionHandler {
    /// Forwards the version slot to `nccl::ncclGetVersion`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` if `args[0]` is not a `c_int`;
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclGetVersion");
        let version_slot = unsafe { args[0].downcast_mut::<c_int>() }.map_err(|_| {
            ServerErr::InvalidType("InvalidType, <version> expected: *mut c_int".into())
        })?;

        let status = unsafe { nccl::ncclGetVersion(version_slot) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclGetVersion".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------ncclGetVersion, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

pub struct NcclCommDestroyHandler;
impl ApiHandler for NcclCommDestroyHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let comm = args[0]
            .downcast::<usize>()
            .map_err(|e| debug!("{}", e))
            .expect("parse comm failed") as nccl::ncclComm_t;

        let res = unsafe { nccl::ncclCommDestroy(comm) };
        debug!("NcclCommDestroyHandler, res ={}", res);
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Handler for `ncclCommAbort`.
///
/// `args[0]` carries the communicator handle encoded as a `usize`.
pub struct NcclCommAbortHandler;
impl ApiHandler for NcclCommAbortHandler {
    /// Calls `nccl::ncclCommAbort` on the decoded communicator.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` if `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: ncclCommAbort");
        // A malformed request must be reported to the caller, not panic the
        // server, so the downcast failure is logged and mapped to an error.
        let comm = args[0].downcast::<usize>().map_err(|e| {
            debug!("{}", e);
            ServerErr::InvalidType("InvalidType, <comm> expected: usize".into())
        })? as nccl::ncclComm_t;

        let res = unsafe { nccl::ncclCommAbort(comm) };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "ncclCommAbort".to_string(),
                res as i32,
            ));
        }

        debug!("--------------ncclCommAbort, res={}", res);
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclCommFinalize`.
///
/// `args[0]` carries the communicator handle encoded as a `usize`.
pub struct NcclCommFinalizeHandler;
impl ApiHandler for NcclCommFinalizeHandler {
    /// Calls `nccl::ncclCommFinalize` on the decoded communicator.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` if `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: ncclCommFinalize");
        // A malformed request must be reported to the caller, not panic the
        // server, so the downcast failure is logged and mapped to an error.
        let comm = args[0].downcast::<usize>().map_err(|e| {
            debug!("{}", e);
            ServerErr::InvalidType("InvalidType, <comm> expected: usize".into())
        })? as nccl::ncclComm_t;

        let res = unsafe { nccl::ncclCommFinalize(comm) };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "ncclCommFinalize".to_string(),
                res as i32,
            ));
        }

        debug!("--------------ncclCommFinalize, res={}", res);
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclCommSplit`.
///
/// Argument layout: `[comm, color, key, newcomm, config]`. `comm` is a
/// communicator handle encoded as `usize`, `newcomm` is the `usize` slot that
/// receives the new handle, and `config` is an `nccl::ncclConfig_t`.
pub struct NcclCommSplitHandler;
impl ApiHandler for NcclCommSplitHandler {
    /// Decodes the arguments and calls `nccl::ncclCommSplit`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` if any argument has an unexpected type;
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: ncclCommSplit");
        // A malformed request must be reported to the caller, not panic the
        // server, so the downcast failure is logged and mapped to an error.
        let comm = args[0].downcast::<usize>().map_err(|e| {
            debug!("{}", e);
            ServerErr::InvalidType("InvalidType, <comm> expected: usize".into())
        })? as nccl::ncclComm_t;

        let color = args[1]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <color> expected: c_int".into()))?;
        let key = args[2]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <key> expected: c_int".into()))?;
        // The `usize` slot is reinterpreted as the out-pointer NCCL writes the
        // new communicator handle into.
        let newcomm = unsafe {
            args[3].downcast_mut::<usize>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <newcomm> expected: usize".into())
            })? as *mut usize as *mut nccl::ncclComm_t
        };
        let config = unsafe {
            args[4].downcast_mut::<nccl::ncclConfig_t>().map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <config> expected: *mut nccl::ncclConfig_t".into(),
                )
            })?
        };

        let res = unsafe { nccl::ncclCommSplit(comm, color, key, newcomm, config) };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "ncclCommSplit".to_string(),
                res as i32,
            ));
        }
        debug!("--------------ncclCommSplit, res={}", res);
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclCommGetAsyncError`.
///
/// Argument layout: `[comm, asyncError]`, where `comm` is a communicator
/// handle encoded as `usize` and `asyncError` is an `nccl::ncclResult_t` slot.
pub struct NcclCommGetAsyncErrorHandler;
impl ApiHandler for NcclCommGetAsyncErrorHandler {
    /// Decodes both arguments, logs them before and after the call, and
    /// forwards them to `nccl::ncclCommGetAsyncError`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommGetAsyncError");

        let comm_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <comm> expected: usize".into()))?;
        let comm = comm_addr as nccl::ncclComm_t;

        let async_slot = unsafe { args[1].downcast_mut::<nccl::ncclResult_t>() }.map_err(|_| {
            ServerErr::InvalidType(
                "InvalidType, <asyncError> expected: *mut nccl::ncclResult_t".into(),
            )
        })?;

        debug!("comm:{:?}", comm);
        debug!("async_err:{:p}, *{:?}", async_slot, *async_slot);

        let status = unsafe { nccl::ncclCommGetAsyncError(comm, async_slot) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclCommGetAsyncError".to_string(),
                status as i32,
            ))
        } else {
            debug!("after comm:{:?}", comm);
            debug!("after async_err:{:p}, *{:?}", async_slot, *async_slot);
            debug!("--------------ncclCommGetAsyncError, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclBcast`.
///
/// Argument layout: `[buff, count, datatype, root, comm, stream]`. Pointer
/// and handle values arrive as `usize` and are cast back before the call.
pub struct NcclBcastHandler;
impl ApiHandler for NcclBcastHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclBcast`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclBcast");

        let buff_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <buff> expected: usize"))?;
        let elem_count = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <count> expected: usize"))?;
        let dtype = args[2]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let root_rank = args[3]
            .downcast::<c_int>()
            .map_err(|_| bad_arg("InvalidType, <root> expected: c_int"))?;
        let comm_addr = args[4]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[5]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclBcast(
                buff_addr as *mut c_void,
                elem_count,
                dtype,
                root_rank,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError("ncclBcast".to_string(), status as i32))
        } else {
            debug!("--------------ncclBcast, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclAllReduce`.
///
/// Argument layout: `[sendbuff, recvbuff, count, datatype, op, comm, stream]`.
/// Pointer and handle values arrive as `usize` and are cast back before the
/// call.
pub struct NcclAllReduceHandler;
impl ApiHandler for NcclAllReduceHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclAllReduce`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclAllReduce");

        let send_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <sendbuff> expected: usize"))?;
        let send_ptr = send_addr as *const c_void;
        debug!("sendbuff:{:p}", send_ptr);

        let recv_addr = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <recvbuff> expected: usize"))?;
        let recv_ptr = recv_addr as *mut c_void;
        debug!("recvbuff:{:p}", recv_ptr);

        let elem_count = args[2]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <count> expected: usize"))?;
        let dtype = args[3]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let red_op = args[4]
            .downcast::<nccl::ncclRedOp_t>()
            .map_err(|_| bad_arg("InvalidType, <op> expected: nccl::ncclRedOp_t"))?;
        let comm_addr = args[5]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[6]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclAllReduce(
                send_ptr,
                recv_ptr,
                elem_count,
                dtype,
                red_op,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclAllReduce".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------ncclAllReduce, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclAllGather`.
///
/// Argument layout: `[sendbuff, recvbuff, sendcount, datatype, comm, stream]`.
/// Pointer and handle values arrive as `usize` and are cast back before the
/// call.
pub struct NcclAllGatherHandler;
impl ApiHandler for NcclAllGatherHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclAllGather`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclAllGather");

        let send_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <sendbuff> expected: usize"))?;
        let recv_addr = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <recvbuff> expected: usize"))?;
        let send_count = args[2]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <sendcount> expected: usize"))?;
        let dtype = args[3]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let comm_addr = args[4]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[5]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclAllGather(
                send_addr as *const c_void,
                recv_addr as *mut c_void,
                send_count,
                dtype,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclAllGather".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------ncclAllGather, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclSend`.
///
/// Argument layout: `[sendbuff, count, datatype, peer, comm, stream]`.
/// Pointer and handle values arrive as `usize` and are cast back before the
/// call.
pub struct NcclSendHandler;
impl ApiHandler for NcclSendHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclSend`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclSend");

        let send_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <sendbuff> expected: usize"))?;
        let elem_count = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <count> expected: usize"))?;
        let dtype = args[2]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let peer_rank = args[3]
            .downcast::<c_int>()
            .map_err(|_| bad_arg("InvalidType, <peer> expected: c_int"))?;
        let comm_addr = args[4]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[5]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclSend(
                send_addr as *const c_void,
                elem_count,
                dtype,
                peer_rank,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError("ncclSend".to_string(), status as i32))
        } else {
            debug!("--------------ncclSend, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclRecv`.
///
/// Argument layout: `[recvbuff, count, datatype, peer, comm, stream]`.
/// Pointer and handle values arrive as `usize` and are cast back before the
/// call.
pub struct NcclRecvHandler;
impl ApiHandler for NcclRecvHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclRecv`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclRecv");

        let recv_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <recvbuff> expected: usize"))?;
        let elem_count = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <count> expected: usize"))?;
        let dtype = args[2]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let peer_rank = args[3]
            .downcast::<c_int>()
            .map_err(|_| bad_arg("InvalidType, <peer> expected: c_int"))?;
        let comm_addr = args[4]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[5]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclRecv(
                recv_addr as *mut c_void,
                elem_count,
                dtype,
                peer_rank,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError("ncclRecv".to_string(), status as i32))
        } else {
            debug!("--------------ncclRecv, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclGroupStart`; it consumes no request arguments.
pub struct NcclGroupStartHandler;
impl ApiHandler for NcclGroupStartHandler {
    /// Calls `nccl::ncclGroupStart` and replies with its status.
    ///
    /// # Errors
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclGroupStart");
        match unsafe { nccl::ncclGroupStart() } {
            failed if failed > 0 => Err(ServerErr::ApiRunError(
                "ncclGroupStart".to_string(),
                failed as i32,
            )),
            status => {
                debug!("--------------ncclGroupStart, res={}", status);
                Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
            }
        }
    }
}

/// Handler for `ncclGroupEnd`; it consumes no request arguments.
pub struct NcclGroupEndHandler;
impl ApiHandler for NcclGroupEndHandler {
    /// Calls `nccl::ncclGroupEnd` and replies with its status.
    ///
    /// # Errors
    /// `ServerErr::ApiRunError` if NCCL returns a status greater than zero.
    fn handle_api(&self, _args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclGroupEnd");
        match unsafe { nccl::ncclGroupEnd() } {
            failed if failed > 0 => Err(ServerErr::ApiRunError(
                "ncclGroupEnd".to_string(),
                failed as i32,
            )),
            status => {
                debug!("--------------ncclGroupEnd, res={}", status);
                Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
            }
        }
    }
}

/// Handler for `cudaMalloc`.
///
/// Argument layout: `[dev_ptr, size]`. `dev_ptr` is a `usize` slot that is
/// reinterpreted as the `*mut *mut c_void` out-pointer handed to CUDA.
pub struct CudaMallocHandler;
impl ApiHandler for CudaMallocHandler {
    /// Decodes the arguments, calls `runtime::cudaMalloc`, and logs the slot
    /// contents before and after the call.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if the runtime returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let slot = unsafe { args[0].downcast_mut::<usize>() }
            .map_err(|_| ServerErr::InvalidType("InvalidType, <dev_ptr> expected: usize".into()))?;
        let out_ptr = (slot as *mut usize).cast::<*mut c_void>();

        let byte_len = *args[1]
            .downcast_ref::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <size> expected: usize".into()))?;

        debug!(
            "----server-1-----cudaMalloc, , ptr: {:p}, *ptr:{:p}, size: {}",
            out_ptr,
            unsafe { *out_ptr },
            byte_len
        );
        let status = unsafe { runtime::cudaMalloc(out_ptr, byte_len) };
        debug!(
            "----server-2-----cudaMalloc, res: {}, ptr: {:p}, *ptr:{:p}, size: {}",
            status,
            out_ptr,
            unsafe { *out_ptr },
            byte_len
        );

        if status > 0 {
            Err(ServerErr::ApiRunError(
                "cudaMalloc".to_string(),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `cudaFree`.
///
/// `args[0]` carries the pointer to release, encoded as a `usize`.
pub struct CudaFreeHandler;
impl ApiHandler for CudaFreeHandler {
    /// Decodes the pointer and passes it to `runtime::cudaFree`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` if `args[0]` is not a `usize`;
    /// `ServerErr::ApiRunError` if the runtime returns a status greater than
    /// zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        let addr = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <dev_ptr> expected: usize".into()))?;
        let target = addr as *mut c_void;

        let status = unsafe { runtime::cudaFree(target) };
        if status > 0 {
            Err(ServerErr::ApiRunError("cudaFree".to_string(), status as i32))
        } else {
            debug!("----server------cudaFree, res: {}, ptr: {:p}", status, target);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `cudaMemcpyAsync`.
///
/// Argument layout: `[dst, src, count, kind, stream, staging]`.
///
/// * Device-to-host: `dst` is a host byte slice, `src` an address encoded as
///   `usize`. When `staging` is non-empty, the copy goes to the staging
///   address first; after the stream is synchronized, the bytes are copied
///   from there into the `dst` slice.
/// * Host-to-device: `dst` is an address encoded as `usize`, `src` a host
///   byte slice. When `staging` is non-empty, the slice is first copied to
///   the staging address, which then becomes the copy source.
/// * Device-to-device: both `dst` and `src` are addresses encoded as `usize`.
/// * Any other `kind` is rejected.
pub struct CudaMemcpyAsyncHandler;
impl ApiHandler for CudaMemcpyAsyncHandler {
    /// Decodes the request and issues `runtime::cudaMemcpyAsync`.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` on a type mismatch, or when a host slice is
    ///   shorter than the requested `count`.
    /// * `ServerErr::ApiRunError` when the copy kind is unsupported, or when
    ///   `cudaMemcpyAsync` / `cudaStreamSynchronize` returns a status greater
    ///   than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaMemcpyAsync");
        let kind = args[3].downcast::<runtime::cudaMemcpyKind>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <kind> expected: runtime::cudaMemcpyKind".into())
        })?;
        let count = *args[2]
            .downcast_ref::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <count> expected: usize".into()))?;

        // `staging` is only set for device-to-host copies that must land in
        // an intermediate buffer before reaching the caller's slice.
        let (dst, src, staging): (*mut c_void, *const c_void, Option<*mut c_void>) =
            if kind == runtime::cudaMemcpyDeviceToHost {
                let buf = unsafe {
                    args[0].downcast_mut_slice::<u8>().map_err(|_| {
                        ServerErr::InvalidType(
                            "InvalidType, <buf> expected: *const c_void".into(),
                        )
                    })?
                };
                // `count` comes from the request; refuse to write past the
                // end of the destination slice.
                if buf.len() < count {
                    return Err(ServerErr::InvalidType(
                        "InvalidType, <buf> is shorter than <count> bytes".into(),
                    ));
                }
                let dst = buf.as_mut_ptr().cast::<c_void>();
                let src = args[1].downcast::<usize>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <src> expected: usize".into())
                })? as *const c_void;

                let staging = if args[5].is_empty() {
                    None
                } else {
                    let dst2 = args[5].downcast::<usize>().map_err(|_| {
                        ServerErr::InvalidType("InvalidType, <dst2> expected: usize".into())
                    })? as *mut c_void;
                    debug!("D2H: dst2:{:p}", dst2);
                    Some(dst2)
                };

                debug!("D2H: dst:{:p}, slice:{:?}", dst, buf);
                debug!("D2H: src:{:p}", src);
                (dst, src, staging)
            } else if kind == runtime::cudaMemcpyHostToDevice {
                let dst = args[0].downcast::<usize>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <dest> expected: usize".into())
                })? as *mut c_void;

                let buf = args[1].downcast_slice::<u8>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <buf> expected: *const c_void".into())
                })?;
                // `count` comes from the request; refuse to read past the
                // end of the source slice.
                if buf.len() < count {
                    return Err(ServerErr::InvalidType(
                        "InvalidType, <buf> is shorter than <count> bytes".into(),
                    ));
                }
                let mut src = buf.as_ptr().cast::<c_void>();

                if !args[5].is_empty() {
                    let src2 = args[5].downcast::<usize>().map_err(|_| {
                        ServerErr::InvalidType("InvalidType, <src2> expected: usize".into())
                    })? as *mut c_void;
                    // SAFETY: `buf` holds at least `count` bytes (checked
                    // above). `src2` is an address supplied by the request.
                    // NOTE(review): it is assumed to reference a writable
                    // buffer of at least `count` bytes that does not overlap
                    // `buf`; this cannot be verified here.
                    unsafe {
                        std::ptr::copy_nonoverlapping(buf.as_ptr(), src2.cast::<u8>(), count)
                    };
                    src = src2 as *const c_void;
                    debug!("H2D: src2:{:p}", src2);
                }

                debug!("H2D: src:{:p}", src);
                debug!("H2D: dst:{:p}", dst);
                (dst, src, None)
            } else if kind == runtime::cudaMemcpyDeviceToDevice {
                let dst = args[0].downcast::<usize>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <dest> expected: usize".into())
                })? as *mut c_void;
                let src = args[1].downcast::<usize>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <src> expected: usize".into())
                })? as *const c_void;

                debug!("D2D: src:{:p}->dst:{:p}", src, dst);
                (dst, src, None)
            } else {
                return Err(ServerErr::ApiRunError(
                    "server: cudaMemcpyAsync::host_to_host copy is not implemented!".to_string(),
                    -1,
                ));
            };

        let stream = args[4]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <stream> expected: usize".into()))?
            as runtime::cudaStream_t;
        debug!(
            "dst: {:p}, src: {:p}, count: {}, kind: {:?}, stream: {:#x}",
            dst, src, count, kind, stream
        );

        // With a staging buffer the asynchronous copy targets it instead of
        // the caller's slice.
        let copy_target = staging.unwrap_or(dst);
        let res = unsafe { runtime::cudaMemcpyAsync(copy_target, src, count, kind, stream) };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cudaMemcpyAsync".to_string(),
                res as i32,
            ));
        }

        if let Some(dst2) = staging {
            // The staging buffer is only meaningful once the stream has
            // finished; a failed synchronization must not be followed by the
            // host-side copy.
            let sync_res = unsafe { runtime::cudaStreamSynchronize(stream) };
            if sync_res > 0 {
                return Err(ServerErr::ApiRunError(
                    "cudaStreamSynchronize".to_string(),
                    sync_res as i32,
                ));
            }
            debug!("*dst2:{:?}", unsafe {
                std::slice::from_raw_parts(dst2 as *const u8, count)
            });
            debug!("before copy *dst:{:?}", unsafe {
                std::slice::from_raw_parts(dst as *const u8, count)
            });
            // SAFETY: `dst` points into a slice of at least `count` bytes
            // (checked above). NOTE(review): `dst2` is assumed to reference
            // at least `count` readable bytes that do not overlap `dst`.
            unsafe { std::ptr::copy_nonoverlapping(dst2 as *const u8, dst as *mut u8, count) };
            debug!("after copy *dst:{:?}", unsafe {
                std::slice::from_raw_parts(dst as *const u8, count)
            });
        }

        debug!("----------MemcpyAsync, res = {}", res);
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclGetUniqueId`.
///
/// `args[0]` is the `nccl::ncclUniqueId` slot that NCCL fills in.
pub struct NcclGetUniqueIdHandler;
impl ApiHandler for NcclGetUniqueIdHandler {
    /// Passes the id slot to `nccl::ncclGetUniqueId`, logging its contents
    /// before the call and again after a successful one.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclGetUniqueId");
        let id_slot = unsafe { args[0].downcast_mut::<nccl::ncclUniqueId>() }.map_err(|_| {
            ServerErr::InvalidType("InvalidType, <unique_id> expected: nccl::ncclUniqueId".into())
        })?;
        debug!("unique_id: {:p}, *{:?}", id_slot, *id_slot);

        let status = unsafe { nccl::ncclGetUniqueId(id_slot) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclGetUniqueId".to_string(),
                status as i32,
            ))
        } else {
            debug!("unique_id: {:p}, *{:?}", id_slot, *id_slot);
            debug!("----------ncclGetUniqueId");
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `ncclCommInitRankConfig`.
///
/// Argument layout: `[comm, nranks, comm_id, rank, config]`. `comm` is the
/// `usize` slot that receives the new communicator handle, `comm_id` is an
/// `nccl::ncclUniqueId`, and `config` is an `nccl::ncclConfig_t`.
pub struct NcclCommInitRankConfigHandler;
impl ApiHandler for NcclCommInitRankConfigHandler {
    /// Decodes the arguments and calls `nccl::ncclCommInitRankConfig`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommInitRankConfig");
        // The `usize` slot is reinterpreted as the out-pointer NCCL writes the
        // communicator handle into.
        let comm = unsafe {
            args[0]
                .downcast_mut::<usize>()
                .map_err(|_| ServerErr::InvalidType("InvalidType, <comm> expected: usize".into()))?
                as *mut usize as *mut nccl::ncclComm_t
        };

        let nranks = args[1]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <nranks> expected: c_int".into()))?;

        let comm_id = args[2].downcast::<nccl::ncclUniqueId>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <comm_id> expected: nccl::ncclUniqueId".into())
        })?;

        let rank = args[3]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <rank> expected: c_int".into()))?;

        let config = unsafe {
            args[4].downcast_mut::<nccl::ncclConfig_t>().map_err(|_| {
                // The message names the argument being parsed (`config`).
                ServerErr::InvalidType(
                    "InvalidType, <config> expected: *mut nccl::ncclConfig_t".into(),
                )
            })?
        };
        debug!("comm: {:p}, *{:?}", comm, unsafe { *comm });
        debug!("comm_id: {:?}", comm_id);

        let res = unsafe { nccl::ncclCommInitRankConfig(comm, nranks, comm_id, rank, config) };

        debug!("after comm: {:p}, *{:?}", comm, unsafe { *comm });
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "ncclCommInitRankConfig".to_string(),
                res as i32,
            ));
        }
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `cudaLaunchKernel`.
///
/// Argument layout: `[offset, grid_dim, block_dim, shared_mem, stream,
/// param_0, param_1, ...]`. The kernel entry point is computed as
/// `BASE_ADDR + offset`; every argument from index 5 onwards is a byte slice
/// whose address becomes one entry of the kernel parameter table.
pub struct CudaLaunchKernelHandler;
impl ApiHandler for CudaLaunchKernelHandler {
    /// Decodes the launch configuration, builds the parameter pointer table,
    /// and calls `runtime::cudaLaunchKernel`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch or when `BASE_ADDR` has not
    /// been set; `ServerErr::ApiRunError` if the runtime returns a non-zero
    /// status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        /// Index of the first kernel parameter in `args`.
        const FIRST_PARAM: usize = 5;
        /// Minimum number of (null-initialized) slots in the parameter table.
        const MIN_PARAM_SLOTS: usize = 32;

        info!("[server] api_name: cudaLaunchKernel");

        let offset = args[0]
            .downcast::<isize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <offset> expected: isize".into()))?;
        let base_addr = BASE_ADDR.get().ok_or_else(|| {
            ServerErr::InvalidType("BASE_ADDR.get() failed: missing u64".into())
        })?;
        let func_ptr = (*base_addr as isize).wrapping_add(offset) as u64 as *const c_void;

        let grid_dim = args[1].downcast::<runtime::dim3>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <grid_dim> expected: runtime::dim3".into())
        })?;

        let block_dim = args[2].downcast::<runtime::dim3>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <block_dim> expected: runtime::dim3".into())
        })?;

        let shared_mem = args[3].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <shared_mem> expected: usize".into())
        })?;

        let stream = args[4]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <stream> expected: usize".into()))?
            as runtime::cudaStream_t;

        debug!(
            "cudaLaunchKernel: gridDim=({},{},{}), blockDim=({},{},{}), sharedMem={}, stream={}",
            grid_dim.x,
            grid_dim.y,
            grid_dim.z,
            block_dim.x,
            block_dim.y,
            block_dim.z,
            shared_mem,
            stream
        );

        // Size the table from the request so a kernel with more than
        // MIN_PARAM_SLOTS parameters no longer indexes out of bounds; smaller
        // requests still get MIN_PARAM_SLOTS null-initialized entries.
        let param_count = args.len().saturating_sub(FIRST_PARAM);
        let mut params: Vec<*mut c_void> =
            vec![std::ptr::null_mut(); param_count.max(MIN_PARAM_SLOTS)];
        for (i, arg) in args.iter_mut().enumerate().skip(FIRST_PARAM) {
            let param_data = unsafe {
                arg.downcast_mut_slice::<u8>().map_err(|_| {
                    ServerErr::InvalidType("InvalidType, <arg_i> expected: *const c_void".into())
                })?
            };
            let param_ptr = param_data.as_mut_ptr().cast::<c_void>();

            debug!(
                "param[{}]: addr={:p}, len={}",
                i,
                param_ptr,
                param_data.len()
            );
            params[i - FIRST_PARAM] = param_ptr;
        }

        let res = unsafe {
            runtime::cudaLaunchKernel(
                func_ptr,
                grid_dim,
                block_dim,
                params.as_mut_ptr(),
                shared_mem,
                stream,
            )
        };
        if res != 0 {
            return Err(ServerErr::ApiRunError(
                "cudaLaunchKernel".to_string(),
                res as i32,
            ));
        }
        debug!("cudaLaunchKernel, res:{}", res);

        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Handler for `ncclReduce`.
///
/// Argument layout:
/// `[sendbuff, recvbuff, count, datatype, op, root, comm, stream]`. Pointer
/// and handle values arrive as `usize` and are cast back before the call.
pub struct NcclReduceHandler;
impl ApiHandler for NcclReduceHandler {
    /// Decodes the arguments and forwards them to `nccl::ncclReduce`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch; `ServerErr::ApiRunError`
    /// if NCCL returns a status greater than zero.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Builds the type-mismatch error for one argument.
        fn bad_arg(msg: &'static str) -> ServerErr {
            ServerErr::InvalidType(msg.into())
        }

        info!("[server] api_name: ncclReduce");

        let send_addr = args[0]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <sendbuff> expected: usize"))?;
        let send_ptr = send_addr as *const c_void;
        debug!("sendbuff:{:p}", send_ptr);

        let recv_addr = args[1]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <recvbuff> expected: usize"))?;
        let recv_ptr = recv_addr as *mut c_void;
        debug!("recvbuff:{:p}", recv_ptr);

        let elem_count = args[2]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <count> expected: usize"))?;
        let dtype = args[3]
            .downcast::<nccl::ncclDataType_t>()
            .map_err(|_| bad_arg("InvalidType, <datatype> expected: nccl::ncclDataType_t"))?;
        let red_op = args[4]
            .downcast::<nccl::ncclRedOp_t>()
            .map_err(|_| bad_arg("InvalidType, <op> expected: nccl::ncclRedOp_t"))?;
        let root_rank = args[5]
            .downcast::<c_int>()
            .map_err(|_| bad_arg("InvalidType, <root> expected: c_int"))?;
        let comm_addr = args[6]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <comm> expected: usize"))?;
        let stream_addr = args[7]
            .downcast::<usize>()
            .map_err(|_| bad_arg("InvalidType, <stream> expected: usize"))?;

        let status = unsafe {
            nccl::ncclReduce(
                send_ptr,
                recv_ptr,
                elem_count,
                dtype,
                red_op,
                root_rank,
                comm_addr as nccl::ncclComm_t,
                stream_addr as nccl::cudaStream_t,
            )
        };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclReduce".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------ncclReduce, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Handler for `cudaFuncSetAttribute`.
///
/// Argument layout: `[offset, attr, value]`. The function pointer is computed
/// as `BASE_ADDR + offset`.
pub struct CudaFuncSetAttributeHandler;
impl ApiHandler for CudaFuncSetAttributeHandler {
    /// Decodes the arguments and calls `runtime::cudaFuncSetAttribute`.
    ///
    /// # Errors
    /// `ServerErr::InvalidType` on a type mismatch or when `BASE_ADDR` has not
    /// been set; `ServerErr::ApiRunError` if the runtime returns a non-zero
    /// status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] cudaFuncSetAttribute");

        let offset = args[0]
            .downcast::<isize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <offset> expected: isize".into()))?;
        let base_addr = BASE_ADDR.get().ok_or_else(|| {
            ServerErr::InvalidType("BASE_ADDR.get() failed: missing u64".into())
        })?;
        let func_ptr = (*base_addr as isize).wrapping_add(offset) as u64 as *const c_void;

        let attr = args[1]
            .downcast::<runtime::cudaFuncAttribute>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <attr> expected: runtime::cudaFuncAttribute".into(),
                )
            })?;

        let value = args[2]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <value> expected: c_int".into()))?;

        debug!(
            " attr: {:?}, offset:{:x}, func_ptr:{:p}, value:{}",
            attr, offset, func_ptr, value
        );
        let res = unsafe { runtime::cudaFuncSetAttribute(func_ptr, attr, value) };
        debug!("cudaFuncSetAttribute, res={}", res);
        if res != 0 {
            // Report the runtime API that was actually invoked.
            return Err(ServerErr::ApiRunError(
                "cudaFuncSetAttribute".to_string(),
                res as i32,
            ));
        }
        Ok(Argument::from_value(res, ArgumentFlag::ARG_OUT))
    }
}

/// Server-side handler that forwards `cudaFuncGetAttributes` to the CUDA runtime.
///
/// Positional arguments:
/// * `args[0]` — `runtime::cudaFuncAttributes` structure that the runtime fills in.
/// * `args[1]` — `isize` offset of the target function, added to [`BASE_ADDR`].
pub struct CudaFuncGetAttributesHandler;
impl ApiHandler for CudaFuncGetAttributesHandler {
    /// Decodes the arguments, rebuilds the function address and queries its attributes.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument has an unexpected type or
    ///   `BASE_ADDR` has not been set yet.
    /// * `ServerErr::ApiRunError` when the runtime returns a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] cudaFuncGetAttributes");

        // SAFETY: assumes args[0] carries a buffer laid out as
        // `runtime::cudaFuncAttributes` — TODO confirm against the client encoder.
        let attr = unsafe {
            args[0]
                .downcast_mut::<runtime::cudaFuncAttributes>()
                .map_err(|_| {
                    ServerErr::InvalidType(
                        "InvalidType, <attr> expected: runtime::cudaFuncAttributes".into(),
                    )
                })?
        };

        let offset = args[1]
            .downcast::<isize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <offset> expected: isize".into()))?;
        // Build the error lazily so the message is only allocated on failure.
        let base_addr = BASE_ADDR.get().ok_or_else(|| {
            ServerErr::InvalidType("BASE_ADDR.get() failed: missing u64".into())
        })?;
        // The function is identified by an offset applied to the server-side BASE_ADDR.
        let func_ptr = (*base_addr as isize).wrapping_add(offset) as u64 as *const c_void;

        debug!(
            "before attr: {:?}, offset:{:x}, func_ptr:{:p}",
            attr, offset, func_ptr
        );

        // SAFETY: assumes `func_ptr` addresses a function inside the library
        // whose base was recorded in BASE_ADDR — TODO confirm against the client.
        let res = unsafe { runtime::cudaFuncGetAttributes(attr, func_ptr) };
        if res > 0 {
            // Use the exact API name so the error can be matched to the call.
            return Err(ServerErr::ApiRunError(
                "cudaFuncGetAttributes".to_string(),
                res as i32,
            ));
        }

        debug!("after attr:{:?}", attr);
        debug!("cudaFuncGetAttributes, res={}", res);

        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

pub struct CublasGemmExHandler;
impl ApiHandler for CublasGemmExHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasGemmEx");
        let handle = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <handle> expected: usize".into()))?
            as cublas::cublasHandle_t;
        let transa = args[1]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transa> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let transb = args[2]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transb> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let m = args[3]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <m> expected: c_int".into()))?;
        let n = args[4]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <n> expected: c_int".into()))?;
        let k = args[5]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <k> expected: c_int".into()))?;
        let alpha_buf = args[6].downcast_slice::<u8>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <alpha_buf> expected: [u8]".into())
        })?;
        let alpha = alpha_buf.as_ptr().cast::<c_void>();
        let a = args[7]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <A> expected: usize".into()))?
            as *const c_void;
        let atype = args[8].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Atype> expected: cublas::cudaDataType".into())
        })?;
        let lda = args[9]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <lda> expected: c_int".into()))?;
        let b = args[10]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <B> expected: usize".into()))?
            as *const c_void;
        let btype = args[11].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Btype> expected: cublas::cudaDataType".into())
        })?;
        let ldb = args[12]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldb> expected: c_int".into()))?;
        let beta_buf = args[13]
            .downcast_slice::<u8>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <beta_buf> expected: [u8]".into()))?;
        let beta = beta_buf.as_ptr().cast::<c_void>();
        let c = args[14]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <C> expected: usize".into()))?
            as *mut c_void;
        let ctype = args[15].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Ctype> expected: cublas::cudaDataType".into())
        })?;
        let ldc = args[16]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldc> expected: c_int".into()))?;
        let compute_type = args[17]
            .downcast::<cublas::cublasComputeType_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <computeType> expected: cublas::cublasComputeType_t".into(),
                )
            })?;
        let algo = args[18]
            .downcast::<cublas::cublasGemmAlgo_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <algo> expected: cublas::cublasGemmAlgo_t".into(),
                )
            })?;
        let res = unsafe {
            cublas::cublasGemmEx(
                handle,
                transa,
                transb,
                m,
                n,
                k,
                alpha,
                a,
                atype,
                lda,
                b,
                btype,
                ldb,
                beta,
                c,
                ctype,
                ldc,
                compute_type,
                algo,
            )
        };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasGemmEx".to_string(),
                res as i32,
            ));
        }

        debug!("--------------cublasGemmEx, res={}", res);
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

pub struct CublasSgemmStridedBatchedHandler;
impl ApiHandler for CublasSgemmStridedBatchedHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasSgemmStridedBatched");
        let handle = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <handle> expected: usize".into()))?
            as cublas::cublasHandle_t;
        let transa = args[1]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transa> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let transb = args[2]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transb> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let m = args[3]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <m> expected: c_int".into()))?;
        let n = args[4]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <n> expected: c_int".into()))?;
        let k = args[5]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <k> expected: c_int".into()))?;
        let alpha = unsafe {
            args[6].downcast_mut::<f32>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <alpha> expected: *const f32".into())
            })?
        };
        let a = args[7]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <A> expected: usize".into()))?
            as *const f32;
        let lda = args[8]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <lda> expected: c_int".into()))?;
        let stride_a = args[9].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideA> expected: c_longlong".into())
        })?;
        let b = args[10]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <B> expected: usize".into()))?
            as *const f32;
        let ldb = args[11]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldb> expected: c_int".into()))?;
        let stride_b = args[12].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideB> expected: c_longlong".into())
        })?;
        let beta = unsafe {
            args[13].downcast_mut::<f32>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <beta> expected: *const f32".into())
            })?
        };
        let c = args[14]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <C> expected: usize".into()))?
            as *mut f32;
        let ldc = args[15]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldc> expected: c_int".into()))?;
        let stride_c = args[16].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideC> expected: c_longlong".into())
        })?;
        let batch_count = args[17].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <batchCount> expected: c_int".into())
        })?;
        let res = unsafe {
            cublas::cublasSgemmStridedBatched(
                handle,
                transa,
                transb,
                m,
                n,
                k,
                alpha,
                a,
                lda,
                stride_a,
                b,
                ldb,
                stride_b,
                beta,
                c,
                ldc,
                stride_c,
                batch_count,
            )
        };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasSgemmStridedBatched".to_string(),
                res as i32,
            ));
        }

        debug!("--------------cublasSgemmStridedBatched, res={}", res);
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

pub struct CublasGemmStridedBatchedExHandler;
impl ApiHandler for CublasGemmStridedBatchedExHandler {
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasGemmStridedBatchedEx");
        let handle = args[0]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <handle> expected: usize".into()))?
            as cublas::cublasHandle_t;
        let transa = args[1]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transa> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let transb = args[2]
            .downcast::<cublas::cublasOperation_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <transb> expected: cublas::cublasOperation_t".into(),
                )
            })?;
        let m = args[3]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <m> expected: c_int".into()))?;
        let n = args[4]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <n> expected: c_int".into()))?;
        let k = args[5]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <k> expected: c_int".into()))?;
        let alpha_buf = args[6].downcast_slice::<u8>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <alpha_buf> expected: [u8]".into())
        })?;
        let alpha = alpha_buf.as_ptr().cast::<c_void>();
        let a = args[7]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <A> expected: usize".into()))?
            as *const c_void;
        let atype = args[8].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Atype> expected: cublas::cudaDataType".into())
        })?;
        let lda = args[9]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <lda> expected: c_int".into()))?;
        let stride_a = args[10].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideA> expected: c_longlong".into())
        })?;
        let b = args[11]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <B> expected: usize".into()))?
            as *const c_void;
        let btype = args[12].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Btype> expected: cublas::cudaDataType".into())
        })?;
        let ldb = args[13]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldb> expected: c_int".into()))?;
        let stride_b = args[14].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideB> expected: c_longlong".into())
        })?;
        let beta_buf = args[15]
            .downcast_slice::<u8>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <beta_buf> expected: [u8]".into()))?;
        let beta = beta_buf.as_ptr().cast::<c_void>();
        let c = args[16]
            .downcast::<usize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <C> expected: usize".into()))?
            as *mut c_void;
        let ctype = args[17].downcast::<cublas::cudaDataType>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <Ctype> expected: cublas::cudaDataType".into())
        })?;
        let ldc = args[18]
            .downcast::<c_int>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <ldc> expected: c_int".into()))?;
        let stride_c = args[19].downcast::<c_longlong>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <strideC> expected: c_longlong".into())
        })?;
        let batch_count = args[20].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <batchCount> expected: c_int".into())
        })?;
        let compute_type = args[21]
            .downcast::<cublas::cublasComputeType_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <computeType> expected: cublas::cublasComputeType_t".into(),
                )
            })?;
        let algo = args[22]
            .downcast::<cublas::cublasGemmAlgo_t>()
            .map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <algo> expected: cublas::cublasGemmAlgo_t".into(),
                )
            })?;
        let res = unsafe {
            cublas::cublasGemmStridedBatchedEx(
                handle,
                transa,
                transb,
                m,
                n,
                k,
                alpha,
                a,
                atype,
                lda,
                stride_a,
                b,
                btype,
                ldb,
                stride_b,
                beta,
                c,
                ctype,
                ldc,
                stride_c,
                batch_count,
                compute_type,
                algo,
            )
        };
        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "cublasGemmStridedBatchedEx".to_string(),
                res as i32,
            ));
        }

        debug!("--------------cublasGemmStridedBatchedEx, res={}", res);
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Server-side handler that forwards `cublasLtCreate` to cuBLASLt.
///
/// `args[0]` is accessed in place as a `usize` slot; its address is handed to
/// the library as the `*mut cublaslt::cublasLtHandle_t` output location.
pub struct CublasLtCreateHandler;
impl ApiHandler for CublasLtCreateHandler {
    /// Returns `ServerErr::InvalidType` when `args[0]` is not a `usize`, and
    /// `ServerErr::ApiRunError` when the library reports a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtCreate");

        // SAFETY: assumes a handle fits in the usize slot of args[0] — TODO confirm.
        let handle_out = unsafe {
            match args[0].downcast_mut::<usize>() {
                Ok(slot) => slot as *mut usize as *mut cublaslt::cublasLtHandle_t,
                Err(_) => {
                    return Err(ServerErr::InvalidType(
                        "InvalidType, <lightHandle> expected: *mut cublaslt::cublasLtHandle_t"
                            .into(),
                    ));
                }
            }
        };

        let status = unsafe { cublaslt::cublasLtCreate(handle_out) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "cublasLtCreate".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------cublasLtCreate, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Server-side handler that forwards `cublasLtDestroy` to cuBLASLt.
///
/// `args[0]` carries the handle as a `usize`, which is cast to
/// `cublaslt::cublasLtHandle_t` before the call.
pub struct CublasLtDestroyHandler;
impl ApiHandler for CublasLtDestroyHandler {
    /// Returns `ServerErr::InvalidType` when `args[0]` is not a `usize`, and
    /// `ServerErr::ApiRunError` when the library reports a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        debug!("[server] api_name: cublasLtDestroy");

        let handle_raw = match args[0].downcast::<usize>() {
            Ok(value) => value,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <lightHandle> expected: usize".into(),
                ));
            }
        };
        let handle = handle_raw as cublaslt::cublasLtHandle_t;

        let status = unsafe { cublaslt::cublasLtDestroy(handle) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "cublasLtDestroy".to_string(),
                status as i32,
            ))
        } else {
            debug!("--------------cublasLtDestroy, res={}", status);
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Server-side handler that forwards
/// `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` to the CUDA runtime.
///
/// Positional arguments:
/// * `args[0]` — `c_int` slot that receives the block count (written in place).
/// * `args[1]` — `isize` offset of the target function, added to [`BASE_ADDR`].
/// * `args[2]` — `c_int` block size.
/// * `args[3]` — `usize` dynamic shared-memory size.
/// * `args[4]` — `c_uint` flags.
pub struct CudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsHandler;
impl ApiHandler for CudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsHandler {
    /// Decodes the arguments, rebuilds the function address and calls the runtime.
    ///
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument has an unexpected type or
    ///   `BASE_ADDR` has not been set yet.
    /// * `ServerErr::ApiRunError` when the runtime returns a non-zero status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");

        let offset = args[1]
            .downcast::<isize>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <offset> expected: isize".into()))?;
        // Build the error lazily so the message is only allocated on failure.
        let base_addr = BASE_ADDR.get().ok_or_else(|| {
            ServerErr::InvalidType("BASE_ADDR.get() failed: missing u64".into())
        })?;
        // The function is identified by an offset applied to the server-side BASE_ADDR.
        let func_ptr = (*base_addr as isize).wrapping_add(offset) as u64 as *const c_void;

        // SAFETY: assumes args[0] holds a c_int output slot — TODO confirm
        // against the client encoder.
        let num_blocks = unsafe {
            args[0].downcast_mut::<c_int>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <num_blocks> expected: *mut c_int".into())
            })?
        };

        let block_size = args[2].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <block_size> expected: c_int".into())
        })?;
        let dynamic_smem_size = args[3].downcast::<usize>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <dynamic_smem_size> expected: usize".into())
        })?;
        let flags = args[4]
            .downcast::<c_uint>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <flags> expected: c_uint".into()))?;
        // SAFETY: assumes `func_ptr` addresses a function inside the library
        // whose base was recorded in BASE_ADDR — TODO confirm against the client.
        let res = unsafe {
            runtime::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                num_blocks,
                func_ptr,
                block_size,
                dynamic_smem_size,
                flags,
            )
        };
        if res as i32 != 0 {
            // Report the runtime API that was actually invoked (previously the
            // driver-API name `cuOccupancy...` was reported by mistake).
            return Err(ServerErr::ApiRunError(
                "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags".to_string(),
                res as i32,
            ));
        }

        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Server-side handler for the `LoadLibrary` request.
///
/// `args[0]` carries the library path as a NUL-terminated byte string. The
/// handler `dlopen`s that path, then scans this process's memory maps for the
/// first region whose file name equals the library's file name and stores that
/// region's start address in [`BASE_ADDR`].
///
/// The returned `u32` status is:
/// * `0` — library loaded and a matching mapping was found;
/// * `1` — `dlopen` failed;
/// * `2` — library loaded but no matching mapping was found.
pub struct LoadLibraryHandler;
impl ApiHandler for LoadLibraryHandler {
    /// # Errors
    /// Returns `ServerErr::InvalidType` when the path argument is not a byte
    /// slice, is not a valid NUL-terminated UTF-8 string, has no file name, or
    /// when the process maps cannot be read.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: LoadLibrary");

        let argv0 = args[0]
            .downcast_slice::<u8>()
            .map_err(|_| ServerErr::InvalidType("InvalidType, <argv0> expected: &[u8]".into()))?;

        // The bytes come from the client: report a malformed string as an
        // error instead of panicking the server.
        let c_str = CStr::from_bytes_with_nul(argv0).map_err(|e| {
            ServerErr::InvalidType(format!(
                "argv0 is not a valid NUL-terminated C string: {}",
                e
            ))
        })?;
        let rust_str = c_str
            .to_str()
            .map_err(|e| ServerErr::InvalidType(format!("argv0 is not UTF-8 encoded: {}", e)))?;
        let lib_path = Path::new(rust_str);

        match dynlib::dlopen(lib_path, dynlib::DlOpenFlag::LAZY) {
            Ok(_) => {
                debug!(
                    "dynlib::dlopen(lib_path, LAZY) ok, lib_path:{}",
                    lib_path.display()
                );
            }
            Err(e) => {
                error!(
                    "dynlib::dlopen(lib_path, LAZY) failed, lib_path:{}, error:{}",
                    lib_path.display(),
                    e
                );
                // A load failure is reported to the client as status 1, not as a server error.
                let ret_value = Argument::from_value(1u32, ArgumentFlag::ARG_OUT);
                return Ok(ret_value);
            }
        }
        let current_process = Process::myself()
            .map_err(|_| ServerErr::InvalidType("Process::myself() failed".into()))?;
        let maps = current_process
            .maps()
            .map_err(|_| ServerErr::InvalidType("current_process.maps() failed".into()))?;

        let target_filename = lib_path
            .file_name()
            .and_then(|name| name.to_str())
            .ok_or_else(|| ServerErr::InvalidType("Invalid library path".into()))?;

        let mut found = false;

        // Only the first mapping with a matching file name is considered.
        for region in maps {
            if let MMapPath::Path(ref path) = region.pathname {
                if path.file_name().and_then(|name| name.to_str()) == Some(target_filename) {
                    // BASE_ADDR can be set only once; a later load keeps the first value.
                    if BASE_ADDR.set(region.address.0).is_err() {
                        warn!("BASE_ADDR already set!");
                    } else {
                        debug!(
                            "Set server-side global BASE_ADDR = 0x{:x}, so_name: {}",
                            region.address.0,
                            path.display()
                        );
                    }
                    found = true;
                    break;
                }
            }
        }

        // Status 2 tells the client the library loaded but its mapping was not located.
        let ret = if found { 0u32 } else { 2u32 };

        let ret_value = Argument::from_value(ret, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Server-side handler that forwards `ncclCommShrink` to NCCL.
///
/// Positional arguments:
/// * `args[0]` — `nccl::ncclComm_t` of the communicator to shrink.
/// * `args[1]` — `[c_int]` list of ranks to exclude.
/// * `args[2]` — `c_int` number of entries NCCL should read from that list.
/// * `args[3]` — `nccl::ncclComm_t` slot that receives the new communicator.
/// * `args[4]` — `nccl::ncclConfig_t`, or an empty argument to pass a null config.
/// * `args[5]` — `c_int` shrink flags.
pub struct NcclCommShrinkHandler;
impl ApiHandler for NcclCommShrinkHandler {
    /// # Errors
    /// * `ServerErr::InvalidType` when an argument has an unexpected type, or
    ///   when the exclude count is larger than the supplied list.
    /// * `ServerErr::ApiRunError` when NCCL reports a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommShrink");
        let comm = args[0].downcast::<nccl::ncclComm_t>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <comm> expected: nccl::ncclComm_t".into())
        })?;
        // SAFETY: assumes args[1] carries a buffer of c_int values — TODO
        // confirm against the client encoder.
        let exclude_ranks_list = unsafe {
            args[1].downcast_mut_slice::<c_int>().map_err(|_| {
                ServerErr::InvalidType("InvalidType, <exclude_ranks_list> expected: [c_int]".into())
            })
        }?;
        let exclude_ranks_len = exclude_ranks_list.len();
        let exclude_ranks_list_ptr = exclude_ranks_list.as_mut_ptr();
        let exclude_ranks_count = args[2].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <exclude_ranks_count> expected: c_int".into())
        })?;
        // The count is supplied by the client independently of the list; a
        // count larger than the list would make NCCL read past the buffer.
        // Negative counts are left for NCCL itself to reject.
        if matches!(usize::try_from(exclude_ranks_count), Ok(n) if n > exclude_ranks_len) {
            return Err(ServerErr::InvalidType(format!(
                "InvalidType, <exclude_ranks_count> {} exceeds <exclude_ranks_list> length {}",
                exclude_ranks_count, exclude_ranks_len
            )));
        }
        // SAFETY: assumes args[3] holds a slot large enough for an
        // ncclComm_t — TODO confirm against the client encoder.
        let newcomm = unsafe {
            args[3].downcast_mut::<nccl::ncclComm_t>().map_err(|_| {
                ServerErr::InvalidType(
                    "InvalidType, <newcomm> expected: *mut nccl::ncclComm_t".into(),
                )
            })? as *mut nccl::ncclComm_t
        };
        // An empty argument stands for "no config" and is forwarded as null.
        let config = if args[4].is_empty() {
            std::ptr::null_mut()
        } else {
            unsafe {
                args[4].downcast_mut::<nccl::ncclConfig_t>().map_err(|_| {
                    ServerErr::InvalidType(
                        "InvalidType, <config> expected: *mut nccl::ncclConfig_t".into(),
                    )
                })?
            }
        };
        let shrink_flags = args[5].downcast::<c_int>().map_err(|_| {
            ServerErr::InvalidType("InvalidType, <shrink_flags> expected: c_int".into())
        })?;

        let res = unsafe {
            nccl::ncclCommShrink(
                comm,
                exclude_ranks_list_ptr,
                exclude_ranks_count,
                newcomm,
                config,
                shrink_flags,
            )
        };

        if res > 0 {
            return Err(ServerErr::ApiRunError(
                "ncclCommShrink".to_string(),
                res as i32,
            ));
        }
        let ret_value = Argument::from_value(res, ArgumentFlag::ARG_OUT);
        Ok(ret_value)
    }
}

/// Server-side handler that forwards `ncclCommInitNewRank` to the NCCL bindings.
///
/// `args[0]` is accessed in place as an `nccl::ncclComm_t` slot whose address
/// is passed to the call; `args[1]` is the `c_int` rank count.
pub struct NcclCommInitNewRankHandler;
impl ApiHandler for NcclCommInitNewRankHandler {
    /// Returns `ServerErr::InvalidType` on an argument type mismatch and
    /// `ServerErr::ApiRunError` when the call reports a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommInitNewRank");
        // Shorthand for the type-mismatch error carrying the given message.
        let invalid = |msg: &str| ServerErr::InvalidType(msg.into());

        // SAFETY: assumes args[0] holds an ncclComm_t slot — TODO confirm
        // against the client encoder.
        let comm = unsafe {
            args[0]
                .downcast_mut::<nccl::ncclComm_t>()
                .map_err(|_| invalid("InvalidType, <comm> expected: nccl::ncclComm_t"))?
                as *mut nccl::ncclComm_t
        };
        let nranks = args[1]
            .downcast::<c_int>()
            .map_err(|_| invalid("InvalidType, <nranks> expected: c_int"))?;

        // The slot is logged before and after the call to show what was written.
        debug!("comm: {:p}, *{:?}", comm, unsafe { *comm });
        let status = unsafe { nccl::ncclCommInitNewRank(comm, nranks) };
        debug!("after comm: {:p}, *{:?}", comm, unsafe { *comm });

        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclCommInitNewRank".to_string(),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Server-side handler that forwards `ncclCommAddNewRank` to the NCCL bindings.
///
/// `args[0]` carries the `nccl::ncclComm_t` the call operates on.
pub struct NcclCommAddNewRankHandler;
impl ApiHandler for NcclCommAddNewRankHandler {
    /// Returns `ServerErr::InvalidType` when `args[0]` is not an
    /// `nccl::ncclComm_t`, and `ServerErr::ApiRunError` on a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommAddNewRank");

        let comm = match args[0].downcast::<nccl::ncclComm_t>() {
            Ok(value) => value,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <comm> expected: nccl::ncclComm_t".into(),
                ));
            }
        };

        let status = unsafe { nccl::ncclCommAddNewRank(comm) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclCommAddNewRank".to_string(),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Server-side handler that forwards `ncclCommSetupNewRank` to the NCCL bindings.
///
/// `args[0]` carries the `nccl::ncclComm_t` the call operates on.
pub struct NcclCommSetupNewRankHandler;
impl ApiHandler for NcclCommSetupNewRankHandler {
    /// Returns `ServerErr::InvalidType` when `args[0]` is not an
    /// `nccl::ncclComm_t`, and `ServerErr::ApiRunError` on a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        info!("[server] api_name: ncclCommSetupNewRank");

        let comm = match args[0].downcast::<nccl::ncclComm_t>() {
            Ok(value) => value,
            Err(_) => {
                return Err(ServerErr::InvalidType(
                    "InvalidType, <comm> expected: nccl::ncclComm_t".into(),
                ));
            }
        };

        let status = unsafe { nccl::ncclCommSetupNewRank(comm) };
        if status > 0 {
            Err(ServerErr::ApiRunError(
                "ncclCommSetupNewRank".to_string(),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}

/// Server-side handler that forwards `cudaHostAlloc` to the CUDA runtime.
///
/// Positional arguments:
/// * `args[0]` — `usize` slot, accessed in place, whose address is passed as
///   the `*mut *mut c_void` output location.
/// * `args[1]` — `usize` allocation size.
/// * `args[2]` — `c_uint` flags.
pub struct CudaHostAllocHandler;
impl ApiHandler for CudaHostAllocHandler {
    /// Returns `ServerErr::InvalidType` on an argument type mismatch and
    /// `ServerErr::ApiRunError` when the runtime reports a positive status.
    fn handle_api(&self, args: &mut [Argument<'_>]) -> Result<Argument<'static>, ServerErr> {
        // Shorthand for the type-mismatch error carrying the given message.
        let invalid = |msg: &str| ServerErr::InvalidType(msg.into());

        // SAFETY: assumes a pointer fits in the usize slot of args[0] — TODO confirm.
        let out_ptr = unsafe {
            args[0]
                .downcast_mut::<usize>()
                .map_err(|_| invalid("InvalidType, <dev_ptr> expected: usize"))?
                as *mut usize as *mut *mut c_void
        };
        let size = *args[1]
            .downcast_ref::<usize>()
            .map_err(|_| invalid("InvalidType, <size> expected: usize"))?;
        let flags = *args[2]
            .downcast_ref::<c_uint>()
            .map_err(|_| invalid("InvalidType, <flags> expected: c_uint"))?;

        // The slot is logged before and after the call to show what was written.
        debug!(
            "----server-1-----cudaHostAlloc, , ptr: {:p}, *ptr:{:p}, size: {}, flags:{}",
            out_ptr,
            unsafe { *out_ptr },
            size,
            flags
        );
        let status = unsafe { runtime::cudaHostAlloc(out_ptr, size, flags) };
        debug!(
            "----server-2-----cudaHostAlloc, res: {}, ptr: {:p}, *ptr:{:p}, size: {}",
            status,
            out_ptr,
            unsafe { *out_ptr },
            size,
        );

        if status > 0 {
            Err(ServerErr::ApiRunError(
                "cudaHostAlloc".to_string(),
                status as i32,
            ))
        } else {
            Ok(Argument::from_value(status, ArgumentFlag::ARG_OUT))
        }
    }
}