// SPDX-License-Identifier: Mulan PSL v2
/*
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This software is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *         http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

use xgpu_macros::api_hook;

#[api_hook(RuntimeApi, backend = crate::hook_impl::ipc::runtime::RuntimeApiImpl)]
mod api {
    use std::ffi::{c_char, c_int, c_uint, c_ulonglong, c_void};

    use cudax::runtime::*;

    unsafe extern "C" {
        // ---- Device-wide control: reset, synchronize, limits, cache config, PCI bus lookup ----
        // Every entry point in this group reports its status through the returned `cudaError_t`.
        pub fn cudaDeviceReset() -> cudaError_t;

        pub fn cudaDeviceSynchronize() -> cudaError_t;

        // Setter: `limit` selects the limit, `value` carries the new setting as a `usize`.
        pub fn cudaDeviceSetLimit(limit: cudaLimit, value: usize) -> cudaError_t;

        // Getter counterpart: note the argument order is (pointer, selector), the reverse of
        // the setter's (selector, value).
        pub fn cudaDeviceGetLimit(p_value: *mut usize, limit: cudaLimit) -> cudaError_t;

        // Takes a read-only channel format descriptor and a device ordinal; the width comes
        // back through `max_width_in_elements` (per the CUDA Runtime API reference).
        pub fn cudaDeviceGetTexture1DLinearMaxWidth(
            max_width_in_elements: *mut usize,
            fmt_desc: *const cudaChannelFormatDesc,
            device: c_int,
        ) -> cudaError_t;

        pub fn cudaDeviceGetCacheConfig(p_cache_config: *mut cudaFuncCache) -> cudaError_t;

        // Two separate `*mut c_int` slots, one per end of the priority range.
        pub fn cudaDeviceGetStreamPriorityRange(
            least_priority: *mut c_int,
            greatest_priority: *mut c_int,
        ) -> cudaError_t;

        pub fn cudaDeviceSetCacheConfig(cache_config: cudaFuncCache) -> cudaError_t;

        // `pci_bus_id` is a read-only C string here (`*const c_char`).
        pub fn cudaDeviceGetByPCIBusId(
            device: *mut c_int,
            pci_bus_id: *const c_char,
        ) -> cudaError_t;

        // Inverse lookup: `pci_bus_id` is a writable C buffer and `len` accompanies it,
        // presumably as the buffer capacity in bytes (confirm against the CUDA reference).
        pub fn cudaDeviceGetPCIBusId(
            pci_bus_id: *mut c_char,
            len: c_int,
            device: c_int,
        ) -> cudaError_t;

        // ---- Inter-process communication handles (cudaIpc*) ----
        // The Get* calls take a `*mut` handle pointer; the Open* calls take the handle struct
        // BY VALUE. NOTE(review): by-value passing over the C ABI requires
        // `cudaIpcEventHandle_t` / `cudaIpcMemHandle_t` to have a C-compatible layout; confirm
        // that the definitions pulled in by `use cudax::runtime::*` are `#[repr(C)]`.
        pub fn cudaIpcGetEventHandle(
            handle: *mut cudaIpcEventHandle_t,
            event: cudaEvent_t,
        ) -> cudaError_t;

        pub fn cudaIpcOpenEventHandle(
            event: *mut cudaEvent_t,
            handle: cudaIpcEventHandle_t,
        ) -> cudaError_t;

        pub fn cudaIpcGetMemHandle(
            handle: *mut cudaIpcMemHandle_t,
            dev_ptr: *mut c_void,
        ) -> cudaError_t;

        // `dev_ptr` is a pointer-to-pointer; `flags` is an unsigned bit field.
        pub fn cudaIpcOpenMemHandle(
            dev_ptr: *mut *mut c_void,
            handle: cudaIpcMemHandle_t,
            flags: c_uint,
        ) -> cudaError_t;

        // Takes the plain device pointer (single indirection), unlike the Open call above.
        pub fn cudaIpcCloseMemHandle(dev_ptr: *mut c_void) -> cudaError_t;

        // ---- GPUDirect RDMA flush, async notification callbacks, shared-memory config ----
        // Both arguments are enum-like selector types taken by value.
        pub fn cudaDeviceFlushGPUDirectRDMAWrites(
            target: cudaFlushGPUDirectRDMAWritesTarget,
            scope: cudaFlushGPUDirectRDMAWritesScope,
        ) -> cudaError_t;

        // Registers `callback_func` for `device`; `user_data` is an opaque pointer passed
        // alongside it. The registration handle comes back through `callback`, and the same
        // handle type is what the Unregister call below consumes by value.
        pub fn cudaDeviceRegisterAsyncNotification(
            device: c_int,
            callback_func: cudaAsyncCallback,
            user_data: *mut c_void,
            callback: *mut cudaAsyncCallbackHandle_t,
        ) -> cudaError_t;

        pub fn cudaDeviceUnregisterAsyncNotification(
            device: c_int,
            callback: cudaAsyncCallbackHandle_t,
        ) -> cudaError_t;

        // Getter/setter pair over `cudaSharedMemConfig`.
        pub fn cudaDeviceGetSharedMemConfig(p_config: *mut cudaSharedMemConfig) -> cudaError_t;

        pub fn cudaDeviceSetSharedMemConfig(config: cudaSharedMemConfig) -> cudaError_t;

        // ---- cudaThread* entry points ----
        // The Synchronize/SetLimit/GetLimit/GetCacheConfig/SetCacheConfig signatures here are
        // identical to the `cudaDevice*` declarations of the same suffix earlier in this block;
        // `cudaThreadExit` has the same no-argument shape as `cudaDeviceReset`.
        // NOTE(review): the CUDA Runtime API reference lists these as deprecated spellings of
        // the cudaDevice* calls; confirm whether the backend should treat them as aliases.
        pub fn cudaThreadExit() -> cudaError_t;

        pub fn cudaThreadSynchronize() -> cudaError_t;

        pub fn cudaThreadSetLimit(limit: cudaLimit, value: usize) -> cudaError_t;

        pub fn cudaThreadGetLimit(p_value: *mut usize, limit: cudaLimit) -> cudaError_t;

        pub fn cudaThreadGetCacheConfig(p_cache_config: *mut cudaFuncCache) -> cudaError_t;

        pub fn cudaThreadSetCacheConfig(cache_config: cudaFuncCache) -> cudaError_t;

        // ---- Error query and stringification ----
        // Both take no arguments and return a `cudaError_t` value directly.
        pub fn cudaGetLastError() -> cudaError_t;

        pub fn cudaPeekAtLastError() -> cudaError_t;

        // Unlike most declarations in this block, the next two return a C string pointer
        // (`*const c_char`) rather than a status code, so there is no error channel.
        // NOTE(review): ownership/lifetime of the returned string is not visible here; the
        // pointer is `*const`, so callers must at least not write through it.
        pub fn cudaGetErrorName(error: cudaError_t) -> *const c_char;

        pub fn cudaGetErrorString(error: cudaError_t) -> *const c_char;

        // ---- Device enumeration, properties, attributes and selection ----
        pub fn cudaGetDeviceCount(count: *mut c_int) -> cudaError_t;

        // The symbol name carries an explicit `_v2` suffix; the hook is bound to that
        // versioned symbol, not to an unsuffixed `cudaGetDeviceProperties`.
        pub fn cudaGetDeviceProperties_v2(prop: *mut cudaDeviceProp, device: c_int) -> cudaError_t;

        // Single-attribute query: the selector is `attr`, the result slot is a `*mut c_int`.
        pub fn cudaDeviceGetAttribute(
            value: *mut c_int,
            attr: cudaDeviceAttr,
            device: c_int,
        ) -> cudaError_t;

        pub fn cudaDeviceGetDefaultMemPool(
            mem_pool: *mut cudaMemPool_t,
            device: c_int,
        ) -> cudaError_t;

        // Setter takes (device, pool); the getter below takes (pool pointer, device).
        pub fn cudaDeviceSetMemPool(device: c_int, mem_pool: cudaMemPool_t) -> cudaError_t;

        pub fn cudaDeviceGetMemPool(mem_pool: *mut cudaMemPool_t, device: c_int) -> cudaError_t;

        // The attribute list is untyped here (`*mut c_void`); `flags` is a signed `c_int`,
        // unlike the `c_uint` flags used by most other declarations in this block.
        pub fn cudaDeviceGetNvSciSyncAttributes(
            nv_sci_sync_attr_list: *mut c_void,
            device: c_int,
            flags: c_int,
        ) -> cudaError_t;

        // Pairwise query between two device ordinals.
        pub fn cudaDeviceGetP2PAttribute(
            value: *mut c_int,
            attr: cudaDeviceP2PAttr,
            src_device: c_int,
            dst_device: c_int,
        ) -> cudaError_t;

        // `prop` is read-only input (`*const`); the chosen ordinal is returned via `device`.
        pub fn cudaChooseDevice(device: *mut c_int, prop: *const cudaDeviceProp) -> cudaError_t;

        // Carries two distinct unsigned flag words: `device_flags` and `flags`.
        pub fn cudaInitDevice(device: c_int, device_flags: c_uint, flags: c_uint) -> cudaError_t;

        pub fn cudaSetDevice(device: c_int) -> cudaError_t;

        pub fn cudaGetDevice(device: *mut c_int) -> cudaError_t;

        // `device_arr` is declared `*mut c_int` with an accompanying element count `len`.
        pub fn cudaSetValidDevices(device_arr: *mut c_int, len: c_int) -> cudaError_t;

        pub fn cudaSetDeviceFlags(flags: c_uint) -> cudaError_t;

        pub fn cudaGetDeviceFlags(flags: *mut c_uint) -> cudaError_t;

        // ---- Stream creation, query, attributes, synchronization ----
        // Three creation variants: plain, with `flags`, and with `flags` + `priority`.
        // Each takes a `*mut cudaStream_t` for the new stream handle.
        pub fn cudaStreamCreate(p_stream: *mut cudaStream_t) -> cudaError_t;

        pub fn cudaStreamCreateWithFlags(p_stream: *mut cudaStream_t, flags: c_uint)
            -> cudaError_t;

        pub fn cudaStreamCreateWithPriority(
            p_stream: *mut cudaStream_t,
            flags: c_uint,
            priority: c_int,
        ) -> cudaError_t;

        // Per-stream getters: the stream handle comes first, the result pointer second.
        pub fn cudaStreamGetPriority(h_stream: cudaStream_t, priority: *mut c_int) -> cudaError_t;

        pub fn cudaStreamGetFlags(h_stream: cudaStream_t, flags: *mut c_uint) -> cudaError_t;

        // The id is 64-bit wide on the C side (`unsigned long long` -> `c_ulonglong`).
        pub fn cudaStreamGetId(h_stream: cudaStream_t, stream_id: *mut c_ulonglong) -> cudaError_t;

        // Takes no stream argument despite sitting among the stream calls.
        pub fn cudaCtxResetPersistingL2Cache() -> cudaError_t;

        // Argument order is (dst, src).
        pub fn cudaStreamCopyAttributes(dst: cudaStream_t, src: cudaStream_t) -> cudaError_t;

        // Attribute get/set share the `cudaLaunchAttributeID` selector; the getter receives a
        // writable value pointer, the setter a read-only one.
        pub fn cudaStreamGetAttribute(
            h_stream: cudaStream_t,
            attr: cudaLaunchAttributeID,
            value_out: *mut cudaLaunchAttributeValue,
        ) -> cudaError_t;

        pub fn cudaStreamSetAttribute(
            h_stream: cudaStream_t,
            attr: cudaLaunchAttributeID,
            value: *const cudaLaunchAttributeValue,
        ) -> cudaError_t;

        pub fn cudaStreamDestroy(stream: cudaStream_t) -> cudaError_t;

        pub fn cudaStreamWaitEvent(
            stream: cudaStream_t,
            event: cudaEvent_t,
            flags: c_uint,
        ) -> cudaError_t;

        // `callback` is a function-pointer type and `user_data` an opaque pointer supplied
        // with it. NOTE(review): the hook backend must keep both valid until the callback
        // has run; that lifetime is not expressed in this signature.
        pub fn cudaStreamAddCallback(
            stream: cudaStream_t,
            callback: cudaStreamCallback_t,
            user_data: *mut c_void,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaStreamSynchronize(stream: cudaStream_t) -> cudaError_t;

        pub fn cudaStreamQuery(stream: cudaStream_t) -> cudaError_t;

        // `length` is a byte count expressed as `usize` (C `size_t`).
        pub fn cudaStreamAttachMemAsync(
            stream: cudaStream_t,
            dev_ptr: *mut c_void,
            length: usize,
            flags: c_uint,
        ) -> cudaError_t;

        // ---- Stream capture (recording stream work into a graph) ----
        pub fn cudaStreamBeginCapture(
            stream: cudaStream_t,
            mode: cudaStreamCaptureMode,
        ) -> cudaError_t;

        // Variant that targets an existing `graph`. `dependencies` and `dependency_data` are
        // two read-only arrays sharing the single length `num_dependencies`.
        pub fn cudaStreamBeginCaptureToGraph(
            stream: cudaStream_t,
            graph: cudaGraph_t,
            dependencies: *const cudaGraphNode_t,
            dependency_data: *const cudaGraphEdgeData,
            num_dependencies: usize,
            mode: cudaStreamCaptureMode,
        ) -> cudaError_t;

        // `mode` is an in/out pointer (`*mut`), matching the "exchange" in the name.
        pub fn cudaThreadExchangeStreamCaptureMode(mode: *mut cudaStreamCaptureMode)
            -> cudaError_t;

        pub fn cudaStreamEndCapture(stream: cudaStream_t, p_graph: *mut cudaGraph_t)
            -> cudaError_t;

        pub fn cudaStreamIsCapturing(
            stream: cudaStream_t,
            p_capture_status: *mut cudaStreamCaptureStatus,
        ) -> cudaError_t;

        // `_v2` and `_v3` are bound as separate symbols. `dependencies_out` is a pointer to a
        // `*const` array pointer, i.e. the callee hands back a borrowed array, not a copy.
        pub fn cudaStreamGetCaptureInfo_v2(
            stream: cudaStream_t,
            capture_status_out: *mut cudaStreamCaptureStatus,
            id_out: *mut c_ulonglong,
            graph_out: *mut cudaGraph_t,
            dependencies_out: *mut *const cudaGraphNode_t,
            num_dependencies_out: *mut usize,
        ) -> cudaError_t;

        // Same as `_v2` plus `edge_data_out`, inserted before `num_dependencies_out`.
        pub fn cudaStreamGetCaptureInfo_v3(
            stream: cudaStream_t,
            capture_status_out: *mut cudaStreamCaptureStatus,
            id_out: *mut c_ulonglong,
            graph_out: *mut cudaGraph_t,
            dependencies_out: *mut *const cudaGraphNode_t,
            edge_data_out: *mut *const cudaGraphEdgeData,
            num_dependencies_out: *mut usize,
        ) -> cudaError_t;

        // Here `dependencies` is `*mut`, whereas `cudaStreamBeginCaptureToGraph` takes its
        // dependency array as `*const`.
        pub fn cudaStreamUpdateCaptureDependencies(
            stream: cudaStream_t,
            dependencies: *mut cudaGraphNode_t,
            num_dependencies: usize,
            flags: c_uint,
        ) -> cudaError_t;

        // `_v2` adds the read-only `dependency_data` array after `dependencies`.
        pub fn cudaStreamUpdateCaptureDependencies_v2(
            stream: cudaStream_t,
            dependencies: *mut cudaGraphNode_t,
            dependency_data: *const cudaGraphEdgeData,
            num_dependencies: usize,
            flags: c_uint,
        ) -> cudaError_t;

        // ---- Events: create, record, query, synchronize, destroy, timing ----
        pub fn cudaEventCreate(event: *mut cudaEvent_t) -> cudaError_t;

        pub fn cudaEventCreateWithFlags(event: *mut cudaEvent_t, flags: c_uint) -> cudaError_t;

        // Recording associates an `event` with a `stream`; the WithFlags variant appends a
        // `c_uint` flag word.
        pub fn cudaEventRecord(event: cudaEvent_t, stream: cudaStream_t) -> cudaError_t;

        pub fn cudaEventRecordWithFlags(
            event: cudaEvent_t,
            stream: cudaStream_t,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaEventQuery(event: cudaEvent_t) -> cudaError_t;

        pub fn cudaEventSynchronize(event: cudaEvent_t) -> cudaError_t;

        pub fn cudaEventDestroy(event: cudaEvent_t) -> cudaError_t;

        // The result slot is a single-precision float (`*mut f32`); the parameter name `ms`
        // suggests milliseconds (confirm against the CUDA reference).
        pub fn cudaEventElapsedTime(
            ms: *mut f32,
            start: cudaEvent_t,
            end: cudaEvent_t,
        ) -> cudaError_t;

        // ---- External memory / semaphore interop ----
        // Import calls take a read-only descriptor and a `*mut` slot for the imported handle.
        pub fn cudaImportExternalMemory(
            ext_mem_out: *mut cudaExternalMemory_t,
            mem_handle_desc: *const cudaExternalMemoryHandleDesc,
        ) -> cudaError_t;

        // Maps an imported memory object to a device pointer described by `buffer_desc`.
        pub fn cudaExternalMemoryGetMappedBuffer(
            dev_ptr: *mut *mut c_void,
            ext_mem: cudaExternalMemory_t,
            buffer_desc: *const cudaExternalMemoryBufferDesc,
        ) -> cudaError_t;

        pub fn cudaExternalMemoryGetMappedMipmappedArray(
            mipmap: *mut cudaMipmappedArray_t,
            ext_mem: cudaExternalMemory_t,
            mipmap_desc: *const cudaExternalMemoryMipmappedArrayDesc,
        ) -> cudaError_t;

        pub fn cudaDestroyExternalMemory(ext_mem: cudaExternalMemory_t) -> cudaError_t;

        pub fn cudaImportExternalSemaphore(
            ext_sem_out: *mut cudaExternalSemaphore_t,
            sem_handle_desc: *const cudaExternalSemaphoreHandleDesc,
        ) -> cudaError_t;

        // Signal/Wait are bound to the `_v2` symbols. Each takes two parallel read-only arrays
        // (`ext_sem_array`, `params_array`) whose shared length is `num_ext_sems`.
        pub fn cudaSignalExternalSemaphoresAsync_v2(
            ext_sem_array: *const cudaExternalSemaphore_t,
            params_array: *const cudaExternalSemaphoreSignalParams,
            num_ext_sems: c_uint,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaWaitExternalSemaphoresAsync_v2(
            ext_sem_array: *const cudaExternalSemaphore_t,
            params_array: *const cudaExternalSemaphoreWaitParams,
            num_ext_sems: c_uint,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaDestroyExternalSemaphore(ext_sem: cudaExternalSemaphore_t) -> cudaError_t;

        // ---- Kernel launch and per-function configuration ----
        // `func` is an opaque `*const c_void` kernel identifier throughout this group.
        // `grid_dim` / `block_dim` are `dim3` structs passed BY VALUE, and `args` is an array
        // of untyped argument pointers (`*mut *mut c_void`).
        pub fn cudaLaunchKernel(
            func: *const c_void,
            grid_dim: dim3,
            block_dim: dim3,
            args: *mut *mut c_void,
            shared_mem: usize,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Extended launch: the launch geometry is carried inside `config` instead of being
        // passed as separate arguments.
        pub fn cudaLaunchKernelExC(
            config: *const cudaLaunchConfig_t,
            func: *const c_void,
            args: *mut *mut c_void,
        ) -> cudaError_t;

        // Same parameter list as `cudaLaunchKernel`.
        pub fn cudaLaunchCooperativeKernel(
            func: *const c_void,
            grid_dim: dim3,
            block_dim: dim3,
            args: *mut *mut c_void,
            shared_mem: usize,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // `launch_params_list` is an array with `num_devices` entries.
        pub fn cudaLaunchCooperativeKernelMultiDevice(
            launch_params_list: *mut cudaLaunchParams,
            num_devices: c_uint,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaFuncSetCacheConfig(
            func: *const c_void,
            cache_config: cudaFuncCache,
        ) -> cudaError_t;

        // Bulk attribute read into a `cudaFuncAttributes` struct.
        pub fn cudaFuncGetAttributes(
            attr: *mut cudaFuncAttributes,
            func: *const c_void,
        ) -> cudaError_t;

        // Single-attribute write; the value is a plain `c_int`.
        pub fn cudaFuncSetAttribute(
            func: *const c_void,
            attr: cudaFuncAttribute,
            value: c_int,
        ) -> cudaError_t;

        // NOTE(review): `cudaFuncGetName` is deliberately left commented out, so no hook is
        // generated for it. The reason is not recorded here; document it or delete the line.
        // pub fn cudaFuncGetName(name: *mut *const c_char, func: *const c_void) -> cudaError_t;

        // `param_index` selects the kernel parameter; offset and size come back separately.
        pub fn cudaFuncGetParamInfo(
            func: *const c_void,
            param_index: usize,
            param_offset: *mut usize,
            param_size: *mut usize,
        ) -> cudaError_t;

        // Both operate in place on a single `f64` through a `*mut` pointer.
        pub fn cudaSetDoubleForDevice(d: *mut f64) -> cudaError_t;

        pub fn cudaSetDoubleForHost(d: *mut f64) -> cudaError_t;

        // The parameter is spelled `fn_` because `fn` is a reserved keyword in Rust.
        pub fn cudaLaunchHostFunc(
            stream: cudaStream_t,
            fn_: cudaHostFn_t,
            user_data: *mut c_void,
        ) -> cudaError_t;

        pub fn cudaFuncSetSharedMemConfig(
            func: *const c_void,
            config: cudaSharedMemConfig,
        ) -> cudaError_t;

        // ---- Occupancy queries ----
        // Each call takes the result pointer first and the opaque kernel identifier `func`
        // second; the remaining arguments describe the launch shape being evaluated.
        pub fn cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            num_blocks: *mut c_int,
            func: *const c_void,
            block_size: c_int,
            dynamic_smem_size: usize,
        ) -> cudaError_t;

        // Inverse direction of the call above: block count and block size are inputs, the
        // dynamic shared-memory size is the output.
        pub fn cudaOccupancyAvailableDynamicSMemPerBlock(
            dynamic_smem_size: *mut usize,
            func: *const c_void,
            num_blocks: c_int,
            block_size: c_int,
        ) -> cudaError_t;

        // Same as the first declaration with a trailing `flags` word.
        pub fn cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
            num_blocks: *mut c_int,
            func: *const c_void,
            block_size: c_int,
            dynamic_smem_size: usize,
            flags: c_uint,
        ) -> cudaError_t;

        // The two cluster queries take a full read-only launch configuration.
        pub fn cudaOccupancyMaxPotentialClusterSize(
            cluster_size: *mut c_int,
            func: *const c_void,
            launch_config: *const cudaLaunchConfig_t,
        ) -> cudaError_t;

        pub fn cudaOccupancyMaxActiveClusters(
            num_clusters: *mut c_int,
            func: *const c_void,
            launch_config: *const cudaLaunchConfig_t,
        ) -> cudaError_t;

        // ---- Allocation and release: device, host, pitched, array, mipmapped ----
        // Allocation calls take a pointer-to-pointer (`*mut *mut c_void`) or a `*mut` handle
        // slot as first argument; sizes are `usize` (C `size_t`).
        pub fn cudaMallocManaged(
            dev_ptr: *mut *mut c_void,
            size: usize,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaMalloc(dev_ptr: *mut *mut c_void, size: usize) -> cudaError_t;

        pub fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> cudaError_t;

        // Besides the pointer, a second output slot `pitch` is provided.
        pub fn cudaMallocPitch(
            dev_ptr: *mut *mut c_void,
            pitch: *mut usize,
            width: usize,
            height: usize,
        ) -> cudaError_t;

        pub fn cudaMallocArray(
            array: *mut cudaArray_t,
            desc: *const cudaChannelFormatDesc,
            width: usize,
            height: usize,
            flags: c_uint,
        ) -> cudaError_t;

        // Release calls take the pointer/handle itself (single indirection).
        pub fn cudaFree(dev_ptr: *mut c_void) -> cudaError_t;

        pub fn cudaFreeHost(ptr: *mut c_void) -> cudaError_t;

        pub fn cudaFreeArray(array: cudaArray_t) -> cudaError_t;

        pub fn cudaFreeMipmappedArray(mipmapped_array: cudaMipmappedArray_t) -> cudaError_t;

        pub fn cudaHostAlloc(p_host: *mut *mut c_void, size: usize, flags: c_uint) -> cudaError_t;

        // Register/Unregister operate on a caller-supplied host pointer rather than
        // producing a new one.
        pub fn cudaHostRegister(ptr: *mut c_void, size: usize, flags: c_uint) -> cudaError_t;

        pub fn cudaHostUnregister(ptr: *mut c_void) -> cudaError_t;

        pub fn cudaHostGetDevicePointer(
            p_device: *mut *mut c_void,
            p_host: *mut c_void,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaHostGetFlags(p_flags: *mut c_uint, p_host: *mut c_void) -> cudaError_t;

        // `cudaExtent` is passed by value in the 3D allocation calls below.
        pub fn cudaMalloc3D(pitcheddev_ptr: *mut cudaPitchedPtr, extent: cudaExtent)
            -> cudaError_t;

        pub fn cudaMalloc3DArray(
            array: *mut cudaArray_t,
            desc: *const cudaChannelFormatDesc,
            extent: cudaExtent,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaMallocMipmappedArray(
            mipmapped_array: *mut cudaMipmappedArray_t,
            desc: *const cudaChannelFormatDesc,
            extent: cudaExtent,
            num_levels: c_uint,
            flags: c_uint,
        ) -> cudaError_t;

        // Takes the `_const_t` flavour of the mipmapped-array handle as input.
        pub fn cudaGetMipmappedArrayLevel(
            level_array: *mut cudaArray_t,
            mipmapped_array: cudaMipmappedArray_const_t,
            level: c_uint,
        ) -> cudaError_t;

        // ---- 3D copies, memory info, array introspection ----
        // The 3D copy calls take every parameter through one read-only parameter struct;
        // the Async variants append the `stream` to enqueue on.
        pub fn cudaMemcpy3D(p: *const cudaMemcpy3DParms) -> cudaError_t;

        pub fn cudaMemcpy3DPeer(p: *const cudaMemcpy3DPeerParms) -> cudaError_t;

        pub fn cudaMemcpy3DAsync(p: *const cudaMemcpy3DParms, stream: cudaStream_t) -> cudaError_t;

        pub fn cudaMemcpy3DPeerAsync(
            p: *const cudaMemcpy3DPeerParms,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Two independent `usize` result slots.
        pub fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> cudaError_t;

        // Three result slots (descriptor, extent, flags) followed by the array being queried.
        pub fn cudaArrayGetInfo(
            desc: *mut cudaChannelFormatDesc,
            extent: *mut cudaExtent,
            flags: *mut c_uint,
            array: cudaArray_t,
        ) -> cudaError_t;

        pub fn cudaArrayGetPlane(
            p_plane_array: *mut cudaArray_t,
            h_array: cudaArray_t,
            plane_idx: c_uint,
        ) -> cudaError_t;

        // The memory-requirement queries exist for both plain and mipmapped arrays and share
        // the `cudaArrayMemoryRequirements` result type.
        pub fn cudaArrayGetMemoryRequirements(
            memory_requirements: *mut cudaArrayMemoryRequirements,
            array: cudaArray_t,
            device: c_int,
        ) -> cudaError_t;

        pub fn cudaMipmappedArrayGetMemoryRequirements(
            memory_requirements: *mut cudaArrayMemoryRequirements,
            mipmap: cudaMipmappedArray_t,
            device: c_int,
        ) -> cudaError_t;

        // Likewise for sparse properties (`cudaArraySparseProperties`).
        pub fn cudaArrayGetSparseProperties(
            sparse_properties: *mut cudaArraySparseProperties,
            array: cudaArray_t,
        ) -> cudaError_t;

        pub fn cudaMipmappedArrayGetSparseProperties(
            sparse_properties: *mut cudaArraySparseProperties,
            mipmap: cudaMipmappedArray_t,
        ) -> cudaError_t;

        // ---- cudaMemcpy family ----
        // Conventions visible across this group:
        //   * destination arguments precede source arguments;
        //   * linear destinations are `*mut c_void`, linear sources `*const c_void`;
        //   * array sources use `cudaArray_const_t`, array destinations `cudaArray_t`;
        //   * `kind: cudaMemcpyKind` selects the copy direction (absent on the Peer variants,
        //     which name both devices explicitly instead);
        //   * every `*Async` variant is the synchronous signature plus a trailing `stream`.
        pub fn cudaMemcpy(
            dst: *mut c_void,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpyPeer(
            dst: *mut c_void,
            dst_device: c_int,
            src: *const c_void,
            src_device: c_int,
            count: usize,
        ) -> cudaError_t;

        // 2D copies carry a pitch per linear side (`dpitch`, `spitch`) plus `width`/`height`.
        pub fn cudaMemcpy2D(
            dst: *mut c_void,
            dpitch: usize,
            src: *const c_void,
            spitch: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Array sides are addressed by (`w_offset`, `h_offset`) instead of a pitch.
        pub fn cudaMemcpy2DToArray(
            dst: cudaArray_t,
            w_offset: usize,
            h_offset: usize,
            src: *const c_void,
            spitch: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpy2DFromArray(
            dst: *mut c_void,
            dpitch: usize,
            src: cudaArray_const_t,
            w_offset: usize,
            h_offset: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpy2DArrayToArray(
            dst: cudaArray_t,
            w_offset_dst: usize,
            h_offset_dst: usize,
            src: cudaArray_const_t,
            w_offset_src: usize,
            h_offset_src: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Symbol copies identify the device symbol by an opaque `*const c_void` and take a
        // byte `offset` into it.
        pub fn cudaMemcpyToSymbol(
            symbol: *const c_void,
            src: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpyFromSymbol(
            dst: *mut c_void,
            symbol: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // ---- Asynchronous counterparts (same arguments + `stream`) ----
        pub fn cudaMemcpyAsync(
            dst: *mut c_void,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpyPeerAsync(
            dst: *mut c_void,
            dst_device: c_int,
            src: *const c_void,
            src_device: c_int,
            count: usize,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpy2DAsync(
            dst: *mut c_void,
            dpitch: usize,
            src: *const c_void,
            spitch: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpy2DToArrayAsync(
            dst: cudaArray_t,
            w_offset: usize,
            h_offset: usize,
            src: *const c_void,
            spitch: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpy2DFromArrayAsync(
            dst: *mut c_void,
            dpitch: usize,
            src: cudaArray_const_t,
            w_offset: usize,
            h_offset: usize,
            width: usize,
            height: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpyToSymbolAsync(
            symbol: *const c_void,
            src: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpyFromSymbolAsync(
            dst: *mut c_void,
            symbol: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // ---- Memset family and device-symbol lookup ----
        // The fill `value` is declared as `c_int` in every variant; sizes are `usize`.
        pub fn cudaMemset(dev_ptr: *mut c_void, value: c_int, count: usize) -> cudaError_t;

        pub fn cudaMemset2D(
            dev_ptr: *mut c_void,
            pitch: usize,
            value: c_int,
            width: usize,
            height: usize,
        ) -> cudaError_t;

        // The 3D variants take `cudaPitchedPtr` and `cudaExtent` structs by value.
        pub fn cudaMemset3D(
            pitcheddev_ptr: cudaPitchedPtr,
            value: c_int,
            extent: cudaExtent,
        ) -> cudaError_t;

        // Async variants: same arguments as above plus a trailing `stream`.
        pub fn cudaMemsetAsync(
            dev_ptr: *mut c_void,
            value: c_int,
            count: usize,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemset2DAsync(
            dev_ptr: *mut c_void,
            pitch: usize,
            value: c_int,
            width: usize,
            height: usize,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemset3DAsync(
            pitcheddev_ptr: cudaPitchedPtr,
            value: c_int,
            extent: cudaExtent,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Symbol queries: the symbol is identified by an opaque `*const c_void`.
        pub fn cudaGetSymbolAddress(
            dev_ptr: *mut *mut c_void,
            symbol: *const c_void,
        ) -> cudaError_t;

        pub fn cudaGetSymbolSize(size: *mut usize, symbol: *const c_void) -> cudaError_t;

        // ---- Memory-range hints and queries: prefetch, advise, range attributes ----
        // Each call addresses a range as (`dev_ptr`, `count`).
        // Original form: the target is a device ordinal (`dst_device: c_int`).
        pub fn cudaMemPrefetchAsync(
            dev_ptr: *const c_void,
            count: usize,
            dst_device: c_int,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // `_v2`: the ordinal is replaced by a by-value `cudaMemLocation` and a `flags` word
        // is added before `stream`.
        pub fn cudaMemPrefetchAsync_v2(
            dev_ptr: *const c_void,
            count: usize,
            location: cudaMemLocation,
            flags: c_uint,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Original form: trailing `device: c_int`.
        pub fn cudaMemAdvise(
            dev_ptr: *const c_void,
            count: usize,
            advice: cudaMemoryAdvise,
            device: c_int,
        ) -> cudaError_t;

        // `_v2`: trailing by-value `cudaMemLocation` instead of the device ordinal.
        pub fn cudaMemAdvise_v2(
            dev_ptr: *const c_void,
            count: usize,
            advice: cudaMemoryAdvise,
            location: cudaMemLocation,
        ) -> cudaError_t;

        // Single-attribute query: `data` is an untyped result buffer of `data_size` bytes.
        pub fn cudaMemRangeGetAttribute(
            data: *mut c_void,
            data_size: usize,
            attribute: cudaMemRangeAttribute,
            dev_ptr: *const c_void,
            count: usize,
        ) -> cudaError_t;

        // Batched form: `data`, `data_sizes` and `attributes` are parallel arrays with
        // `num_attributes` entries each.
        pub fn cudaMemRangeGetAttributes(
            data: *mut *mut c_void,
            data_sizes: *mut usize,
            attributes: *mut cudaMemRangeAttribute,
            num_attributes: usize,
            dev_ptr: *const c_void,
            count: usize,
        ) -> cudaError_t;

        // ---- Count-based array copies ----
        // These differ from the `cudaMemcpy2D*Array*` declarations in that the transfer size
        // is a single byte `count` rather than `width`/`height` (and no pitch is passed).
        // NOTE(review): the CUDA Runtime API reference lists these entry points as deprecated;
        // confirm they still need to be hooked.
        pub fn cudaMemcpyToArray(
            dst: cudaArray_t,
            w_offset: usize,
            h_offset: usize,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpyFromArray(
            dst: *mut c_void,
            src: cudaArray_const_t,
            w_offset: usize,
            h_offset: usize,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        pub fn cudaMemcpyArrayToArray(
            dst: cudaArray_t,
            w_offset_dst: usize,
            h_offset_dst: usize,
            src: cudaArray_const_t,
            w_offset_src: usize,
            h_offset_src: usize,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Async variants: same arguments plus a trailing `stream`. No async
        // array-to-array variant is declared in this group.
        pub fn cudaMemcpyToArrayAsync(
            dst: cudaArray_t,
            w_offset: usize,
            h_offset: usize,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaMemcpyFromArrayAsync(
            dst: *mut c_void,
            src: cudaArray_const_t,
            w_offset: usize,
            h_offset: usize,
            count: usize,
            kind: cudaMemcpyKind,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // ---- Stream-ordered allocation and memory pools (cudaMemPool*) ----
        // Alloc/free pair that takes the stream to order against as the last argument.
        pub fn cudaMallocAsync(
            dev_ptr: *mut *mut c_void,
            size: usize,
            h_stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaFreeAsync(dev_ptr: *mut c_void, h_stream: cudaStream_t) -> cudaError_t;

        pub fn cudaMemPoolTrimTo(mem_pool: cudaMemPool_t, min_bytes_to_keep: usize) -> cudaError_t;

        // Attribute get/set share one signature: `value` is an untyped `*mut c_void` in both
        // directions, so its pointee type is implied by `attr` and not checked by the compiler.
        pub fn cudaMemPoolSetAttribute(
            mem_pool: cudaMemPool_t,
            attr: cudaMemPoolAttr,
            value: *mut c_void,
        ) -> cudaError_t;

        pub fn cudaMemPoolGetAttribute(
            mem_pool: cudaMemPool_t,
            attr: cudaMemPoolAttr,
            value: *mut c_void,
        ) -> cudaError_t;

        // `desc_list` is a read-only array of `count` access descriptors.
        pub fn cudaMemPoolSetAccess(
            mem_pool: cudaMemPool_t,
            desc_list: *const cudaMemAccessDesc,
            count: usize,
        ) -> cudaError_t;

        // `location` is declared `*mut` here even though it names the location to query.
        pub fn cudaMemPoolGetAccess(
            flags: *mut cudaMemAccessFlags,
            mem_pool: cudaMemPool_t,
            location: *mut cudaMemLocation,
        ) -> cudaError_t;

        pub fn cudaMemPoolCreate(
            mem_pool: *mut cudaMemPool_t,
            pool_props: *const cudaMemPoolProps,
        ) -> cudaError_t;

        pub fn cudaMemPoolDestroy(mem_pool: cudaMemPool_t) -> cudaError_t;

        // Like `cudaMallocAsync` but with an explicit pool argument.
        pub fn cudaMallocFromPoolAsync(
            ptr: *mut *mut c_void,
            size: usize,
            mem_pool: cudaMemPool_t,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Export/import of a whole pool: `shareable_handle` is untyped (`*mut c_void`) and
        // its concrete representation is selected by `handle_type`.
        pub fn cudaMemPoolExportToShareableHandle(
            shareable_handle: *mut c_void,
            mem_pool: cudaMemPool_t,
            handle_type: cudaMemAllocationHandleType,
            flags: c_uint,
        ) -> cudaError_t;

        pub fn cudaMemPoolImportFromShareableHandle(
            mem_pool: *mut cudaMemPool_t,
            shareable_handle: *mut c_void,
            handle_type: cudaMemAllocationHandleType,
            flags: c_uint,
        ) -> cudaError_t;

        // Export/import of a single allocation via a `cudaMemPoolPtrExportData` record.
        pub fn cudaMemPoolExportPointer(
            export_data: *mut cudaMemPoolPtrExportData,
            ptr: *mut c_void,
        ) -> cudaError_t;

        pub fn cudaMemPoolImportPointer(
            ptr: *mut *mut c_void,
            mem_pool: cudaMemPool_t,
            export_data: *mut cudaMemPoolPtrExportData,
        ) -> cudaError_t;

        // ---- Pointer attributes and peer-to-peer access ----
        // Accepts any pointer value (`*const c_void`) and fills a `cudaPointerAttributes`.
        pub fn cudaPointerGetAttributes(
            attributes: *mut cudaPointerAttributes,
            ptr: *const c_void,
        ) -> cudaError_t;

        // The answer is written as a `c_int` rather than a Rust `bool`.
        pub fn cudaDeviceCanAccessPeer(
            can_access_peer: *mut c_int,
            device: c_int,
            peer_device: c_int,
        ) -> cudaError_t;

        // Enable/Disable name only the peer device; neither takes a local device argument.
        pub fn cudaDeviceEnablePeerAccess(peer_device: c_int, flags: c_uint) -> cudaError_t;

        pub fn cudaDeviceDisablePeerAccess(peer_device: c_int) -> cudaError_t;

        // ---- Graphics interop resources ----
        // Only the API-agnostic calls are declared in this group; no registration call is
        // present in this group.
        pub fn cudaGraphicsUnregisterResource(resource: cudaGraphicsResource_t) -> cudaError_t;

        pub fn cudaGraphicsResourceSetMapFlags(
            resource: cudaGraphicsResource_t,
            flags: c_uint,
        ) -> cudaError_t;

        // Map/Unmap take an array of `count` resource handles and the stream to order on.
        // `count` is a signed `c_int` here.
        pub fn cudaGraphicsMapResources(
            count: c_int,
            resources: *mut cudaGraphicsResource_t,
            stream: cudaStream_t,
        ) -> cudaError_t;

        pub fn cudaGraphicsUnmapResources(
            count: c_int,
            resources: *mut cudaGraphicsResource_t,
            stream: cudaStream_t,
        ) -> cudaError_t;

        // Accessors for a mapped resource: as linear memory (pointer + size) ...
        pub fn cudaGraphicsResourceGetMappedPointer(
            dev_ptr: *mut *mut c_void,
            size: *mut usize,
            resource: cudaGraphicsResource_t,
        ) -> cudaError_t;

        // ... as one sub-resource array selected by (`array_index`, `mip_level`) ...
        pub fn cudaGraphicsSubResourceGetMappedArray(
            array: *mut cudaArray_t,
            resource: cudaGraphicsResource_t,
            array_index: c_uint,
            mip_level: c_uint,
        ) -> cudaError_t;

        // ... or as a whole mipmapped array.
        pub fn cudaGraphicsResourceGetMappedMipmappedArray(
            mipmapped_array: *mut cudaMipmappedArray_t,
            resource: cudaGraphicsResource_t,
        ) -> cudaError_t;

        pub fn cudaGetChannelDesc(
            desc: *mut cudaChannelFormatDesc,
            array: cudaArray_const_t,
        ) -> cudaError_t;

        pub fn cudaCreateChannelDesc(
            x: c_int,
            y: c_int,
            z: c_int,
            w: c_int,
            f: cudaChannelFormatKind,
        ) -> cudaChannelFormatDesc;

        // Texture-object bindings. All return a `cudaError_t` status.
        //
        // Create: `p_tex_object` is the only `*mut` parameter (presumably receives
        // the new handle); the three descriptor pointers are `*const` inputs.
        // NOTE(review): whether any descriptor pointer may be null is not visible
        // here — check the CUDA Runtime API reference.
        pub fn cudaCreateTextureObject(
            p_tex_object: *mut cudaTextureObject_t,
            p_res_desc: *const cudaResourceDesc,
            p_tex_desc: *const cudaTextureDesc,
            p_res_view_desc: *const cudaResourceViewDesc,
        ) -> cudaError_t;

        // Destroy: the texture object handle is passed by value.
        pub fn cudaDestroyTextureObject(tex_object: cudaTextureObject_t) -> cudaError_t;

        // The three getters below share one shape: a `*mut` descriptor pointer
        // (presumably an output) followed by the texture object handle by value.
        pub fn cudaGetTextureObjectResourceDesc(
            p_res_desc: *mut cudaResourceDesc,
            tex_object: cudaTextureObject_t,
        ) -> cudaError_t;

        pub fn cudaGetTextureObjectTextureDesc(
            p_tex_desc: *mut cudaTextureDesc,
            tex_object: cudaTextureObject_t,
        ) -> cudaError_t;

        pub fn cudaGetTextureObjectResourceViewDesc(
            p_res_view_desc: *mut cudaResourceViewDesc,
            tex_object: cudaTextureObject_t,
        ) -> cudaError_t;

        // Surface-object bindings. All return a `cudaError_t` status.
        //
        // Create: `p_surf_object` is `*mut` (presumably receives the new handle);
        // `p_res_desc` is a `*const` input descriptor.
        pub fn cudaCreateSurfaceObject(
            p_surf_object: *mut cudaSurfaceObject_t,
            p_res_desc: *const cudaResourceDesc,
        ) -> cudaError_t;

        // Destroy: the surface object handle is passed by value.
        pub fn cudaDestroySurfaceObject(surf_object: cudaSurfaceObject_t) -> cudaError_t;

        // Getter: `*mut` descriptor pointer (presumably an output) plus the surface
        // object handle by value.
        pub fn cudaGetSurfaceObjectResourceDesc(
            p_res_desc: *mut cudaResourceDesc,
            surf_object: cudaSurfaceObject_t,
        ) -> cudaError_t;

        // Version queries. Each takes a single `*mut c_int` (presumably where the
        // version number is stored) and returns a `cudaError_t` status.
        // NOTE(review): the integer encoding of the version is not visible here —
        // see the CUDA Runtime API reference.
        pub fn cudaDriverGetVersion(driver_version: *mut c_int) -> cudaError_t;

        pub fn cudaRuntimeGetVersion(runtime_version: *mut c_int) -> cudaError_t;

        // Binding for `cudaGraphCreate`. `p_graph` is a `*mut cudaGraph_t`
        // (presumably receives the new graph handle) and `flags` is an unsigned
        // int passed by value; returns a `cudaError_t` status.
        pub fn cudaGraphCreate(p_graph: *mut cudaGraph_t, flags: c_uint) -> cudaError_t;

        // Graph kernel-node bindings. All return a `cudaError_t` status.
        //
        // Add: follows the parameter layout shared by the `cudaGraphAdd*Node`
        // declarations in this file — `*mut` node pointer (presumably the output),
        // the graph handle, a `*const` dependency pointer with its `usize` count,
        // then the node-specific payload (`*const cudaKernelNodeParams` here).
        pub fn cudaGraphAddKernelNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            p_node_params: *const cudaKernelNodeParams,
        ) -> cudaError_t;

        // Getter: parameters are written through a `*mut cudaKernelNodeParams`.
        pub fn cudaGraphKernelNodeGetParams(
            node: cudaGraphNode_t,
            p_node_params: *mut cudaKernelNodeParams,
        ) -> cudaError_t;

        // Setter: parameters are read through a `*const cudaKernelNodeParams`.
        pub fn cudaGraphKernelNodeSetParams(
            node: cudaGraphNode_t,
            p_node_params: *const cudaKernelNodeParams,
        ) -> cudaError_t;

        // Takes two node handles by value; going by the names, attributes flow
        // from `h_src` to `h_dst` (confirm in the CUDA Runtime API reference).
        pub fn cudaGraphKernelNodeCopyAttributes(
            h_src: cudaGraphNode_t,
            h_dst: cudaGraphNode_t,
        ) -> cudaError_t;

        // Attribute getter: `attr` selects the attribute, `value_out` is the
        // `*mut cudaLaunchAttributeValue` destination.
        pub fn cudaGraphKernelNodeGetAttribute(
            h_node: cudaGraphNode_t,
            attr: cudaLaunchAttributeID,
            value_out: *mut cudaLaunchAttributeValue,
        ) -> cudaError_t;

        // Attribute setter: same shape as the getter but `value` is `*const`.
        pub fn cudaGraphKernelNodeSetAttribute(
            h_node: cudaGraphNode_t,
            attr: cudaLaunchAttributeID,
            value: *const cudaLaunchAttributeValue,
        ) -> cudaError_t;

        // Memcpy-node creation bindings. All four start with the same prefix —
        // `*mut` node pointer (presumably the output), graph handle, `*const`
        // dependency pointer and its `usize` count — and return `cudaError_t`.
        // They differ only in how the copy itself is described.
        //
        // General form: the copy is described by a `*const cudaMemcpy3DParms`.
        pub fn cudaGraphAddMemcpyNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            p_copy_params: *const cudaMemcpy3DParms,
        ) -> cudaError_t;

        // To-symbol form: `symbol` and `src` are both `*const c_void`, with
        // `count`, `offset` and `kind` by value.
        // NOTE(review): `count`/`offset` are presumably in bytes — confirm.
        pub fn cudaGraphAddMemcpyNodeToSymbol(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            symbol: *const c_void,
            src: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // From-symbol form: mirrors the to-symbol form, but the first data pointer
        // is a `*mut c_void` destination and `symbol` comes second.
        pub fn cudaGraphAddMemcpyNodeFromSymbol(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            dst: *mut c_void,
            symbol: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // 1D form: plain `dst`/`src` pointers with `count` and `kind`; there is no
        // `offset` parameter in this variant.
        pub fn cudaGraphAddMemcpyNode1D(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            dst: *mut c_void,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Memcpy-node parameter accessors for an existing `node`. All return a
        // `cudaError_t` status.
        //
        // Getter: parameters are written through a `*mut cudaMemcpy3DParms`.
        pub fn cudaGraphMemcpyNodeGetParams(
            node: cudaGraphNode_t,
            p_node_params: *mut cudaMemcpy3DParms,
        ) -> cudaError_t;

        // Setter, general form: parameters are read through a `*const` pointer.
        pub fn cudaGraphMemcpyNodeSetParams(
            node: cudaGraphNode_t,
            p_node_params: *const cudaMemcpy3DParms,
        ) -> cudaError_t;

        // Setter, to-symbol form: `symbol` and `src` are `*const c_void`.
        // NOTE(review): `count`/`offset` are presumably in bytes — confirm.
        pub fn cudaGraphMemcpyNodeSetParamsToSymbol(
            node: cudaGraphNode_t,
            symbol: *const c_void,
            src: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Setter, from-symbol form: `dst` is `*mut c_void`, `symbol` is `*const`.
        pub fn cudaGraphMemcpyNodeSetParamsFromSymbol(
            node: cudaGraphNode_t,
            dst: *mut c_void,
            symbol: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Setter, 1D form: plain `dst`/`src` pointers; no `offset` parameter.
        pub fn cudaGraphMemcpyNodeSetParams1D(
            node: cudaGraphNode_t,
            dst: *mut c_void,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Memset-node bindings. All return a `cudaError_t` status.
        //
        // Add: `*mut` node pointer (presumably the output), graph handle, `*const`
        // dependency pointer with its `usize` count, then the memset description
        // as a `*const cudaMemsetParams`.
        pub fn cudaGraphAddMemsetNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            p_memset_params: *const cudaMemsetParams,
        ) -> cudaError_t;

        // Getter: parameters are written through a `*mut cudaMemsetParams`.
        pub fn cudaGraphMemsetNodeGetParams(
            node: cudaGraphNode_t,
            p_node_params: *mut cudaMemsetParams,
        ) -> cudaError_t;

        // Setter: parameters are read through a `*const cudaMemsetParams`.
        pub fn cudaGraphMemsetNodeSetParams(
            node: cudaGraphNode_t,
            p_node_params: *const cudaMemsetParams,
        ) -> cudaError_t;

        // Host-node bindings. All return a `cudaError_t` status.
        //
        // Add: `*mut` node pointer (presumably the output), graph handle, `*const`
        // dependency pointer with its `usize` count, then the host-node description
        // as a `*const cudaHostNodeParams`.
        pub fn cudaGraphAddHostNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            p_node_params: *const cudaHostNodeParams,
        ) -> cudaError_t;

        // Getter: parameters are written through a `*mut cudaHostNodeParams`.
        pub fn cudaGraphHostNodeGetParams(
            node: cudaGraphNode_t,
            p_node_params: *mut cudaHostNodeParams,
        ) -> cudaError_t;

        // Setter: parameters are read through a `*const cudaHostNodeParams`.
        pub fn cudaGraphHostNodeSetParams(
            node: cudaGraphNode_t,
            p_node_params: *const cudaHostNodeParams,
        ) -> cudaError_t;

        // Child-graph and empty node bindings. All return a `cudaError_t` status.
        //
        // Add child graph: the usual node-creation prefix (`*mut` node pointer,
        // graph handle, `*const` dependencies with `usize` count) followed by the
        // `child_graph` handle passed by value.
        pub fn cudaGraphAddChildGraphNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            child_graph: cudaGraph_t,
        ) -> cudaError_t;

        // Getter: `p_graph` is a `*mut cudaGraph_t`, presumably where the graph
        // associated with `node` is stored.
        pub fn cudaGraphChildGraphNodeGetGraph(
            node: cudaGraphNode_t,
            p_graph: *mut cudaGraph_t,
        ) -> cudaError_t;

        // Add empty node: only the node-creation prefix; no payload parameter.
        pub fn cudaGraphAddEmptyNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
        ) -> cudaError_t;

        // Event record / event wait node bindings. The record and wait families
        // have identical signatures apart from their names. All return a
        // `cudaError_t` status.
        //
        // Add record node: node-creation prefix (`*mut` node pointer, graph handle,
        // `*const` dependencies with `usize` count) plus the `event` handle.
        pub fn cudaGraphAddEventRecordNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // Getter: `event_out` is a `*mut cudaEvent_t` (presumably the output).
        pub fn cudaGraphEventRecordNodeGetEvent(
            node: cudaGraphNode_t,
            event_out: *mut cudaEvent_t,
        ) -> cudaError_t;

        // Setter: the event handle is passed by value.
        pub fn cudaGraphEventRecordNodeSetEvent(
            node: cudaGraphNode_t,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // Add wait node: same parameter list as the record-node variant.
        pub fn cudaGraphAddEventWaitNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // Getter: `event_out` is a `*mut cudaEvent_t` (presumably the output).
        pub fn cudaGraphEventWaitNodeGetEvent(
            node: cudaGraphNode_t,
            event_out: *mut cudaEvent_t,
        ) -> cudaError_t;

        // Setter: the event handle is passed by value.
        pub fn cudaGraphEventWaitNodeSetEvent(
            node: cudaGraphNode_t,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // External-semaphore signal / wait node bindings. The two families are
        // parallel: signal uses `cudaExternalSemaphoreSignalNodeParams`, wait uses
        // `cudaExternalSemaphoreWaitNodeParams`. All return a `cudaError_t` status.
        //
        // Add signal node: node-creation prefix (`*mut` node pointer, graph handle,
        // `*const` dependencies with `usize` count) plus `*const` node parameters.
        pub fn cudaGraphAddExternalSemaphoresSignalNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            node_params: *const cudaExternalSemaphoreSignalNodeParams,
        ) -> cudaError_t;

        // Getter: parameters are written through the `*mut` `params_out` pointer.
        pub fn cudaGraphExternalSemaphoresSignalNodeGetParams(
            h_node: cudaGraphNode_t,
            params_out: *mut cudaExternalSemaphoreSignalNodeParams,
        ) -> cudaError_t;

        // Setter: parameters are read through the `*const` `node_params` pointer.
        pub fn cudaGraphExternalSemaphoresSignalNodeSetParams(
            h_node: cudaGraphNode_t,
            node_params: *const cudaExternalSemaphoreSignalNodeParams,
        ) -> cudaError_t;

        // Add wait node: same layout as the signal variant with the wait params.
        pub fn cudaGraphAddExternalSemaphoresWaitNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            node_params: *const cudaExternalSemaphoreWaitNodeParams,
        ) -> cudaError_t;

        // Getter: parameters are written through the `*mut` `params_out` pointer.
        pub fn cudaGraphExternalSemaphoresWaitNodeGetParams(
            h_node: cudaGraphNode_t,
            params_out: *mut cudaExternalSemaphoreWaitNodeParams,
        ) -> cudaError_t;

        // Setter: parameters are read through the `*const` `node_params` pointer.
        pub fn cudaGraphExternalSemaphoresWaitNodeSetParams(
            h_node: cudaGraphNode_t,
            node_params: *const cudaExternalSemaphoreWaitNodeParams,
        ) -> cudaError_t;

        // Memory alloc / free node bindings. All return a `cudaError_t` status.
        //
        // Add alloc node: node-creation prefix (`*mut` node pointer, graph handle,
        // `*const` dependencies with `usize` count). Note that `node_params` is
        // `*mut` here, whereas most other `cudaGraphAdd*Node` declarations in this
        // file take their payload as `*const`.
        // NOTE(review): presumably the runtime writes results (e.g. the allocated
        // address) back into the struct — confirm in the CUDA Runtime API docs.
        pub fn cudaGraphAddMemAllocNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            node_params: *mut cudaMemAllocNodeParams,
        ) -> cudaError_t;

        // Getter: parameters are written through the `*mut` `params_out` pointer.
        pub fn cudaGraphMemAllocNodeGetParams(
            node: cudaGraphNode_t,
            params_out: *mut cudaMemAllocNodeParams,
        ) -> cudaError_t;

        // Add free node: node-creation prefix plus `dptr`, a `*mut c_void` passed
        // by value (presumably the address to free).
        pub fn cudaGraphAddMemFreeNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            dptr: *mut c_void,
        ) -> cudaError_t;

        // Getter: `dptr_out` is a single-level `*mut c_void`, not `*mut *mut c_void`.
        // NOTE(review): presumably it points at storage that receives the pointer
        // value; verify the expected pointee against the CUDA header before use.
        pub fn cudaGraphMemFreeNodeGetParams(
            node: cudaGraphNode_t,
            dptr_out: *mut c_void,
        ) -> cudaError_t;

        // Per-device graph-memory bindings. All take the device as a `c_int` and
        // return a `cudaError_t` status.
        //
        // Trim: the device number is the only parameter.
        pub fn cudaDeviceGraphMemTrim(device: c_int) -> cudaError_t;

        // Attribute getter: `attr` selects the attribute; `value` is an untyped
        // `*mut c_void`.
        // NOTE(review): the pointee type/size presumably depends on `attr` — this
        // is not visible in the signature; consult the CUDA Runtime API reference.
        pub fn cudaDeviceGetGraphMemAttribute(
            device: c_int,
            attr: cudaGraphMemAttributeType,
            value: *mut c_void,
        ) -> cudaError_t;

        // Attribute setter: same parameter list as the getter, including a `*mut`
        // (not `*const`) `value` pointer.
        pub fn cudaDeviceSetGraphMemAttribute(
            device: c_int,
            attr: cudaGraphMemAttributeType,
            value: *mut c_void,
        ) -> cudaError_t;

        // Graph cloning bindings. Both return a `cudaError_t` status.
        //
        // Clone: `p_graph_clone` is a `*mut cudaGraph_t` (presumably receives the
        // clone); the original graph handle is passed by value.
        pub fn cudaGraphClone(
            p_graph_clone: *mut cudaGraph_t,
            original_graph: cudaGraph_t,
        ) -> cudaError_t;

        // Lookup: `p_node` is a `*mut cudaGraphNode_t` (presumably the output);
        // inputs are a node handle and a graph handle, named `original_node` and
        // `cloned_graph`.
        pub fn cudaGraphNodeFindInClone(
            p_node: *mut cudaGraphNode_t,
            original_node: cudaGraphNode_t,
            cloned_graph: cudaGraph_t,
        ) -> cudaError_t;

        // Graph inspection bindings. All return a `cudaError_t` status.
        //
        // Node type: `p_type` is a `*mut cudaGraphNodeType` (presumably the output).
        pub fn cudaGraphNodeGetType(
            node: cudaGraphNode_t,
            p_type: *mut cudaGraphNodeType,
        ) -> cudaError_t;

        // Node listing: both the node buffer and the count are `*mut`.
        // NOTE(review): the count is presumably in/out (capacity in, number
        // written out) — confirm against the CUDA Runtime API reference.
        pub fn cudaGraphGetNodes(
            graph: cudaGraph_t,
            nodes: *mut cudaGraphNode_t,
            num_nodes: *mut usize,
        ) -> cudaError_t;

        // Root-node listing: same shape as `cudaGraphGetNodes`.
        pub fn cudaGraphGetRootNodes(
            graph: cudaGraph_t,
            p_root_nodes: *mut cudaGraphNode_t,
            p_num_root_nodes: *mut usize,
        ) -> cudaError_t;

        // Edge listing bindings. Both return a `cudaError_t` status.
        //
        // `from` and `to` are separate `*mut cudaGraphNode_t` buffers and the count
        // is a `*mut usize`.
        // NOTE(review): presumably `from[i]`/`to[i]` form one edge and `num_edges`
        // is in/out — confirm against the CUDA Runtime API reference.
        pub fn cudaGraphGetEdges(
            graph: cudaGraph_t,
            from: *mut cudaGraphNode_t,
            to: *mut cudaGraphNode_t,
            num_edges: *mut usize,
        ) -> cudaError_t;

        // `_v2` variant: identical except for the additional
        // `*mut cudaGraphEdgeData` buffer placed before the count.
        pub fn cudaGraphGetEdges_v2(
            graph: cudaGraph_t,
            from: *mut cudaGraphNode_t,
            to: *mut cudaGraphNode_t,
            edge_data: *mut cudaGraphEdgeData,
            num_edges: *mut usize,
        ) -> cudaError_t;

        // Per-node dependency / dependent listing bindings. All return a
        // `cudaError_t` status. Each takes the node by value, a `*mut` node buffer
        // and a `*mut usize` count; the `_v2` variants additionally take a
        // `*mut cudaGraphEdgeData` buffer placed before the count.
        // NOTE(review): the count pointers are presumably in/out — confirm
        // against the CUDA Runtime API reference.
        pub fn cudaGraphNodeGetDependencies(
            node: cudaGraphNode_t,
            p_dependencies: *mut cudaGraphNode_t,
            p_num_dependencies: *mut usize,
        ) -> cudaError_t;

        pub fn cudaGraphNodeGetDependencies_v2(
            node: cudaGraphNode_t,
            p_dependencies: *mut cudaGraphNode_t,
            edge_data: *mut cudaGraphEdgeData,
            p_num_dependencies: *mut usize,
        ) -> cudaError_t;

        // Dependent-node listing: same shapes as the dependency listing above.
        pub fn cudaGraphNodeGetDependentNodes(
            node: cudaGraphNode_t,
            p_dependent_nodes: *mut cudaGraphNode_t,
            p_num_dependent_nodes: *mut usize,
        ) -> cudaError_t;

        pub fn cudaGraphNodeGetDependentNodes_v2(
            node: cudaGraphNode_t,
            p_dependent_nodes: *mut cudaGraphNode_t,
            edge_data: *mut cudaGraphEdgeData,
            p_num_dependent_nodes: *mut usize,
        ) -> cudaError_t;

        // Dependency edit bindings. All return a `cudaError_t` status. The add and
        // remove declarations share one shape: graph handle, two `*const` node
        // pointers (`from`, `to`) and a by-value `usize` count. The `_v2` variants
        // insert a `*const cudaGraphEdgeData` pointer before the count.
        // NOTE(review): presumably `from`/`to` are parallel arrays of length
        // `num_dependencies` — confirm against the CUDA Runtime API reference.
        pub fn cudaGraphAddDependencies(
            graph: cudaGraph_t,
            from: *const cudaGraphNode_t,
            to: *const cudaGraphNode_t,
            num_dependencies: usize,
        ) -> cudaError_t;

        pub fn cudaGraphAddDependencies_v2(
            graph: cudaGraph_t,
            from: *const cudaGraphNode_t,
            to: *const cudaGraphNode_t,
            edge_data: *const cudaGraphEdgeData,
            num_dependencies: usize,
        ) -> cudaError_t;

        // Remove: same parameter lists as the add variants.
        pub fn cudaGraphRemoveDependencies(
            graph: cudaGraph_t,
            from: *const cudaGraphNode_t,
            to: *const cudaGraphNode_t,
            num_dependencies: usize,
        ) -> cudaError_t;

        pub fn cudaGraphRemoveDependencies_v2(
            graph: cudaGraph_t,
            from: *const cudaGraphNode_t,
            to: *const cudaGraphNode_t,
            edge_data: *const cudaGraphEdgeData,
            num_dependencies: usize,
        ) -> cudaError_t;

        // Binding for `cudaGraphDestroyNode`. The node handle is passed by value
        // and a `cudaError_t` status is returned.
        pub fn cudaGraphDestroyNode(node: cudaGraphNode_t) -> cudaError_t;

        // Graph instantiation bindings. All return a `cudaError_t` status and
        // take a `*mut cudaGraphExec_t` (presumably receives the executable graph)
        // plus the source graph handle.
        //
        // `flags` is a 64-bit unsigned value (`c_ulonglong`).
        pub fn cudaGraphInstantiate(
            p_graph_exec: *mut cudaGraphExec_t,
            graph: cudaGraph_t,
            flags: c_ulonglong,
        ) -> cudaError_t;

        // Declared with exactly the same parameter list as `cudaGraphInstantiate`.
        pub fn cudaGraphInstantiateWithFlags(
            p_graph_exec: *mut cudaGraphExec_t,
            graph: cudaGraph_t,
            flags: c_ulonglong,
        ) -> cudaError_t;

        // Takes a parameter struct instead of flags; the pointer is `*mut`.
        // NOTE(review): presumably the runtime writes result fields back into the
        // struct — confirm in the CUDA Runtime API reference.
        pub fn cudaGraphInstantiateWithParams(
            p_graph_exec: *mut cudaGraphExec_t,
            graph: cudaGraph_t,
            instantiate_params: *mut cudaGraphInstantiateParams,
        ) -> cudaError_t;

        // Flag query: `flags` is a `*mut c_ulonglong` (presumably the output) for
        // the executable graph passed by value.
        pub fn cudaGraphExecGetFlags(
            graph_exec: cudaGraphExec_t,
            flags: *mut c_ulonglong,
        ) -> cudaError_t;

        // Executable-graph parameter setters for kernel and memcpy nodes. Each
        // takes the executable graph handle and a node handle by value, followed
        // by the new parameters, and returns a `cudaError_t` status. The payload
        // shapes mirror the corresponding `cudaGraph*NodeSetParams*` declarations.
        //
        // Kernel node: parameters are read through a `*const cudaKernelNodeParams`.
        pub fn cudaGraphExecKernelNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            p_node_params: *const cudaKernelNodeParams,
        ) -> cudaError_t;

        // Memcpy node, general form: `*const cudaMemcpy3DParms`.
        pub fn cudaGraphExecMemcpyNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            p_node_params: *const cudaMemcpy3DParms,
        ) -> cudaError_t;

        // Memcpy node, to-symbol form: `symbol` and `src` are `*const c_void`.
        // NOTE(review): `count`/`offset` are presumably in bytes — confirm.
        pub fn cudaGraphExecMemcpyNodeSetParamsToSymbol(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            symbol: *const c_void,
            src: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Memcpy node, from-symbol form: `dst` is `*mut c_void`.
        pub fn cudaGraphExecMemcpyNodeSetParamsFromSymbol(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            dst: *mut c_void,
            symbol: *const c_void,
            count: usize,
            offset: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Memcpy node, 1D form: plain `dst`/`src` pointers; no `offset` parameter.
        pub fn cudaGraphExecMemcpyNodeSetParams1D(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            dst: *mut c_void,
            src: *const c_void,
            count: usize,
            kind: cudaMemcpyKind,
        ) -> cudaError_t;

        // Executable-graph parameter setters for memset, host and child-graph
        // nodes. Each takes the executable graph handle and a node handle by
        // value and returns a `cudaError_t` status.
        //
        // Memset node: parameters are read through a `*const cudaMemsetParams`.
        pub fn cudaGraphExecMemsetNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            p_node_params: *const cudaMemsetParams,
        ) -> cudaError_t;

        // Host node: parameters are read through a `*const cudaHostNodeParams`.
        pub fn cudaGraphExecHostNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            p_node_params: *const cudaHostNodeParams,
        ) -> cudaError_t;

        // Child-graph node: the replacement graph handle is passed by value.
        pub fn cudaGraphExecChildGraphNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            child_graph: cudaGraph_t,
        ) -> cudaError_t;

        // Executable-graph setters for event and external-semaphore nodes. Each
        // takes the executable graph handle and a node handle (`h_node`) by value
        // and returns a `cudaError_t` status.
        //
        // Event record node: the event handle is passed by value.
        pub fn cudaGraphExecEventRecordNodeSetEvent(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // Event wait node: same parameter list as the record variant.
        pub fn cudaGraphExecEventWaitNodeSetEvent(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            event: cudaEvent_t,
        ) -> cudaError_t;

        // Semaphore signal node: parameters are read through a `*const` pointer.
        pub fn cudaGraphExecExternalSemaphoresSignalNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            node_params: *const cudaExternalSemaphoreSignalNodeParams,
        ) -> cudaError_t;

        // Semaphore wait node: parameters are read through a `*const` pointer.
        pub fn cudaGraphExecExternalSemaphoresWaitNodeSetParams(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            node_params: *const cudaExternalSemaphoreWaitNodeParams,
        ) -> cudaError_t;

        // Node enable/disable bindings. Despite the `cudaGraphNode*` names, both
        // take an executable graph handle (`cudaGraphExec_t`) as their first
        // parameter. Both return a `cudaError_t` status.
        //
        // Setter: `is_enabled` is an unsigned int passed by value.
        // NOTE(review): presumably 0 means disabled and non-zero enabled — confirm.
        pub fn cudaGraphNodeSetEnabled(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            is_enabled: c_uint,
        ) -> cudaError_t;

        // Getter: `is_enabled` is a `*mut c_uint` (presumably the output).
        pub fn cudaGraphNodeGetEnabled(
            h_graph_exec: cudaGraphExec_t,
            h_node: cudaGraphNode_t,
            is_enabled: *mut c_uint,
        ) -> cudaError_t;

        // Executable-graph update / launch / teardown bindings. All return a
        // `cudaError_t` status.
        //
        // Update: takes the executable graph and a graph handle by value;
        // `result_info` is a `*mut cudaGraphExecUpdateResultInfo` (presumably
        // filled with details about the update outcome).
        pub fn cudaGraphExecUpdate(
            h_graph_exec: cudaGraphExec_t,
            h_graph: cudaGraph_t,
            result_info: *mut cudaGraphExecUpdateResultInfo,
        ) -> cudaError_t;

        // Upload and launch have identical parameter lists: executable graph
        // handle plus a stream handle, both by value.
        pub fn cudaGraphUpload(graph_exec: cudaGraphExec_t, stream: cudaStream_t) -> cudaError_t;

        pub fn cudaGraphLaunch(graph_exec: cudaGraphExec_t, stream: cudaStream_t) -> cudaError_t;

        // Destroy the executable graph / the graph; each takes its handle by value.
        pub fn cudaGraphExecDestroy(graph_exec: cudaGraphExec_t) -> cudaError_t;

        pub fn cudaGraphDestroy(graph: cudaGraph_t) -> cudaError_t;

        // Binding for `cudaGraphDebugDotPrint`. `path` is a `*const c_char`
        // (presumably a NUL-terminated C string naming the output file — confirm)
        // and `flags` is an unsigned int; returns a `cudaError_t` status.
        pub fn cudaGraphDebugDotPrint(
            graph: cudaGraph_t,
            path: *const c_char,
            flags: c_uint,
        ) -> cudaError_t;

        // User-object bindings. All return a `cudaError_t` status.
        //
        // Create: `object_out` is a `*mut cudaUserObject_t` (presumably receives
        // the new handle). `ptr` is an opaque `*mut c_void` and `destroy` is a
        // `cudaHostFn_t` callback.
        // NOTE(review): presumably `destroy` is invoked with `ptr` once the
        // reference count reaches zero — confirm in the CUDA Runtime API docs.
        pub fn cudaUserObjectCreate(
            object_out: *mut cudaUserObject_t,
            ptr: *mut c_void,
            destroy: cudaHostFn_t,
            initial_refcount: c_uint,
            flags: c_uint,
        ) -> cudaError_t;

        // Retain / release: object handle plus an unsigned `count`, by value.
        pub fn cudaUserObjectRetain(object: cudaUserObject_t, count: c_uint) -> cudaError_t;

        pub fn cudaUserObjectRelease(object: cudaUserObject_t, count: c_uint) -> cudaError_t;

        // Graph-side retain: takes graph and object handles, `count` and `flags`.
        pub fn cudaGraphRetainUserObject(
            graph: cudaGraph_t,
            object: cudaUserObject_t,
            count: c_uint,
            flags: c_uint,
        ) -> cudaError_t;

        // Graph-side release: like the retain variant but without `flags`.
        pub fn cudaGraphReleaseUserObject(
            graph: cudaGraph_t,
            object: cudaUserObject_t,
            count: c_uint,
        ) -> cudaError_t;

        // Generic (type-agnostic) node bindings built around `cudaGraphNodeParams`.
        // All return a `cudaError_t` status. In every declaration here the
        // `node_params` pointer is `*mut`, not `*const`.
        //
        // Add: `*mut` node pointer (presumably the output), graph handle, `*const`
        // dependency pointer with its `usize` count, then the node parameters.
        pub fn cudaGraphAddNode(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            num_dependencies: usize,
            node_params: *mut cudaGraphNodeParams,
        ) -> cudaError_t;

        // `_v2` variant: adds a `*const cudaGraphEdgeData` pointer
        // (`dependency_data`) between the dependency pointer and the count.
        pub fn cudaGraphAddNode_v2(
            p_graph_node: *mut cudaGraphNode_t,
            graph: cudaGraph_t,
            p_dependencies: *const cudaGraphNode_t,
            dependency_data: *const cudaGraphEdgeData,
            num_dependencies: usize,
            node_params: *mut cudaGraphNodeParams,
        ) -> cudaError_t;

        // Set parameters on a node of a (non-executable) graph.
        pub fn cudaGraphNodeSetParams(
            node: cudaGraphNode_t,
            node_params: *mut cudaGraphNodeParams,
        ) -> cudaError_t;

        // Set parameters on a node within an executable graph.
        pub fn cudaGraphExecNodeSetParams(
            graph_exec: cudaGraphExec_t,
            node: cudaGraphNode_t,
            node_params: *mut cudaGraphNodeParams,
        ) -> cudaError_t;

        // Binding for `cudaGraphConditionalHandleCreate`. `p_handle_out` is a
        // `*mut cudaGraphConditionalHandle` (presumably receives the handle);
        // the graph handle, `default_launch_value` and `flags` are passed by
        // value. Returns a `cudaError_t` status.
        pub fn cudaGraphConditionalHandleCreate(
            p_handle_out: *mut cudaGraphConditionalHandle,
            graph: cudaGraph_t,
            default_launch_value: c_uint,
            flags: c_uint,
        ) -> cudaError_t;

        // Symbol / entry-point lookup bindings. All return a `cudaError_t` status.
        //
        // Driver entry point: `symbol` is a `*const c_char` (presumably a
        // NUL-terminated name), `func_ptr` is a pointer-to-pointer output slot,
        // `flags` is 64-bit unsigned, and `driver_status` is a `*mut` result enum.
        // NOTE(review): whether `driver_status` may be null is not visible here.
        pub fn cudaGetDriverEntryPoint(
            symbol: *const c_char,
            func_ptr: *mut *mut c_void,
            flags: c_ulonglong,
            driver_status: *mut cudaDriverEntryPointQueryResult,
        ) -> cudaError_t;

        // Export table: `pp_export_table` is `*mut *const c_void`, i.e. a slot
        // that holds a pointer to read-only data; the table is selected by a
        // `*const cudaUUID_t`.
        pub fn cudaGetExportTable(
            pp_export_table: *mut *const c_void,
            p_export_table_id: *const cudaUUID_t,
        ) -> cudaError_t;

        // `function_ptr` is a `*mut cudaFunction_t` (presumably the output) for
        // the symbol address given as `*const c_void`.
        pub fn cudaGetFuncBySymbol(
            function_ptr: *mut cudaFunction_t,
            symbol_ptr: *const c_void,
        ) -> cudaError_t;

        // `kernel_ptr` is a `*mut cudaKernel_t` (presumably the output) for the
        // entry function address given as `*const c_void`.
        pub fn cudaGetKernel(
            kernel_ptr: *mut cudaKernel_t,
            entry_func_addr: *const c_void,
        ) -> cudaError_t;
    }
}

pub use api::RuntimeApi;