# 160e89e1 created on 2025-11-24 (commit history)
cuda_runtime_template = '''
/**
 * Copyright 2023-2025 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


#include <stdio.h>
#include <string.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <chrono>
#include <vector>
#include <numeric>
#include <cuda.h>
#include <cuda_runtime.h>
#include<iomanip>
#include <cuda_fp16.h>

// Evaluates a CUDA driver API call exactly once. When the result is not
// CUDA_SUCCESS, prints the source location and an error description to
// std::cerr and terminates the process with exit code 1.
// The driver leaves the message pointer null for error codes it does not
// recognize, so a fallback text is substituted before streaming it.
#define checkCudaDrvErrors(call)                                                                                   \\
    do {                                                                                                           \\
        CUresult status = (call);                                                                                  \\
        if (status != CUDA_SUCCESS) {                                                                              \\
            const char *msg = nullptr;                                                                             \\
            if (cuGetErrorString(status, &msg) != CUDA_SUCCESS || msg == nullptr) {                                \\
                msg = "unknown CUDA driver error";                                                                 \\
            }                                                                                                      \\
            std::cerr << "CUDA error at line " << __LINE__ << " in file " << __FILE__ << ": " << msg << std::endl; \\
            exit(1);                                                                                               \\
        }                                                                                                          \\
    } while (0)

// Reads the entire file named by `filename` and returns its contents.
// When `filename` is null or the file cannot be opened, a diagnostic is
// written to std::cerr and an empty string is returned, so the caller's
// subsequent failure can be traced back to the missing file.
std::string ReadFileToString(const char *filename)
{
    std::ifstream ifile;
    if (filename != nullptr) {
        ifile.open(filename);
    }
    if (!ifile.is_open()) {
        std::cerr << "Failed to open file: " << (filename != nullptr ? filename : "(null)") << std::endl;
        return std::string();
    }

    std::ostringstream buf;
    // Stream the whole file buffer at once instead of copying byte by byte.
    buf << ifile.rdbuf();
    return buf.str();
}

// Benchmarks the kernel on CUDA device 0 using the driver API.
// After one untimed launch, performs `repeat` measurements; each measurement
// times `number` back-to-back launches on a dedicated stream with the host
// clock and records the mean time per launch. The average over all
// measurements is printed in milliseconds.
//   number        - launches per measurement; values below 1 are treated as 1
//   repeat        - number of measurements; values below 1 are treated as 1
//   min_repeat_ms - accepted for interface compatibility; currently not used
// Any failing driver call terminates the process through checkCudaDrvErrors.
extern "C" void cuda_runtime_profiling(
    rt_code_params_list,
    int number=1,
    int repeat=1,
    int min_repeat_ms=0
    )
{
    std::cout<< "Start runtime profiling:" << std::endl;
    (void)min_repeat_ms;
    // Guard against a division by zero and an empty result set below.
    if (number < 1) {
        number = 1;
    }
    if (repeat < 1) {
        repeat = 1;
    }

    // initialize cuda
    checkCudaDrvErrors(cuInit(0));
    // get the number of cuda devices
    int device_count = 0;
    checkCudaDrvErrors(cuDeviceGetCount(&device_count));

    if (device_count == 0) {
        std::cerr << "No cuda devices found" << std::endl;
        exit(1);
    }

    // get the first cuda device
    CUdevice device;
    checkCudaDrvErrors(cuDeviceGet(&device, 0));

    // create a cuda context on the device
    CUcontext context;
    checkCudaDrvErrors(cuCtxCreate(&context, 0, device));

    CUstream stream;
    checkCudaDrvErrors(cuStreamCreate(&stream, 0));

    std::string ptx_code = ReadFileToString(rt_code_ptx_path);

    CUmodule module;
    checkCudaDrvErrors(cuModuleLoadData(&module, ptx_code.c_str()));

    // get the cuda kernel function
    CUfunction kernel;
    checkCudaDrvErrors(cuModuleGetFunction(&kernel, module, rt_code_kernel_name));

    // allocate input and output arrays on the gpu
rt_code_mem_alloc

    // copy input arrays from cpu to gpu
rt_code_mem_copy_htod

rt_code_set_grid_params
rt_code_set_block_params
rt_code_init_memref_params
    std::cout<< "Profiling init done;" << std::endl;
    // launch the kernel
    void *args[] = {rt_code_set_args_params};

    // skip first launch (untimed warm-up); the checked synchronization also
    // surfaces any kernel execution error before measurements start
    checkCudaDrvErrors(cuLaunchKernel(kernel, gx, gy, gz, bx, by, bz, 0, stream, args, NULL));
    checkCudaDrvErrors(cuStreamSynchronize(stream));

    std::vector<double> res;
    res.reserve(static_cast<size_t>(repeat));
    for (int measurement = 0; measurement < repeat; ++measurement) {
        // start timing
        const auto tbegin = std::chrono::high_resolution_clock::now();
        for (int launch = 0; launch < number; ++launch) {
            checkCudaDrvErrors(cuLaunchKernel(kernel, gx, gy, gz, bx, by, bz, 0, stream, args, NULL));
        }
        // wait for all queued launches so the interval covers their execution
        checkCudaDrvErrors(cuStreamSynchronize(stream));
        const auto tend = std::chrono::high_resolution_clock::now();

        // elapsed seconds divided by the launch count: seconds per launch
        const double seconds_per_launch =
            std::chrono::duration_cast<std::chrono::duration<double> >(tend - tbegin).count() / number;
        res.push_back(seconds_per_launch);
    }

    // copy output array from gpu to cpu
rt_code_mem_copy_dtoh
    // free memory
rt_code_free_d_mem
    checkCudaDrvErrors(cuStreamDestroy(stream));
    checkCudaDrvErrors(cuModuleUnload(module));
    checkCudaDrvErrors(cuCtxDestroy(context));

    // mean of the per-measurement results (seconds), reported in milliseconds
    const double avg = std::accumulate(res.begin(), res.end(), 0.0) / static_cast<double>(res.size());
    std::cout<< "average latency = " <<  std::fixed << std::setprecision(10)  << avg * 1000 << "ms" << std::endl;
    std::cout<< "Finish runtime profiling." << std::endl;
}


// Runs the kernel once on CUDA device 0 using the driver API: creates a
// context, loads the module, launches the kernel on the default stream,
// waits for it to finish, and releases the module and context again.
// Any failing driver call terminates the process through checkCudaDrvErrors.
extern "C" void cuda_runtime_exec(rt_code_params_list)
{
    std::cout<< "Start runtime execution." << std::endl;
    // initialize cuda
    checkCudaDrvErrors(cuInit(0));

    // get the number of cuda devices
    int device_count = 0;
    checkCudaDrvErrors(cuDeviceGetCount(&device_count));

    if (device_count == 0) {
        std::cerr << "No cuda devices found" << std::endl;
        exit(1);
    }

    // get the first cuda device
    CUdevice device;
    checkCudaDrvErrors(cuDeviceGet(&device, 0));

    // create a cuda context on the device
    CUcontext context;
    checkCudaDrvErrors(cuCtxCreate(&context, 0, device));

    std::string ptx_code = ReadFileToString(rt_code_ptx_path);

    CUmodule module;
    checkCudaDrvErrors(cuModuleLoadData(&module, ptx_code.c_str()));

    // get the cuda kernel function
    CUfunction kernel;
    checkCudaDrvErrors(cuModuleGetFunction(&kernel, module, rt_code_kernel_name));

    // allocate input and output arrays on the gpu
rt_code_mem_alloc

    // copy input arrays from cpu to gpu
rt_code_mem_copy_htod

rt_code_set_grid_params
rt_code_set_block_params
rt_code_init_memref_params

    // launch the kernel
    void *args[] = {rt_code_set_args_params};
    checkCudaDrvErrors(cuLaunchKernel(kernel, gx, gy, gz, bx, by, bz, 0, NULL, args, NULL));
    // a launch only reports configuration errors; synchronizing reports
    // errors raised while the kernel was executing
    checkCudaDrvErrors(cuCtxSynchronize());

    // copy output array from gpu to cpu
rt_code_mem_copy_dtoh
    // free memory
rt_code_free_d_mem
    // release the per-call module and context so repeated calls do not leak
    checkCudaDrvErrors(cuModuleUnload(module));
    checkCudaDrvErrors(cuCtxDestroy(context));
    std::cout<< "Finish runtime execution." << std::endl;
}
'''

cpu_profiling_template = '''
  // Thin wrapper: forwards to @_mlir_ciface_nanoTime and returns its i64 result.
  llvm.func @nanoTime() -> i64 attributes {llvm.emit_c_interface, sym_visibility = "private"} {
    %now = llvm.call @_mlir_ciface_nanoTime() : () -> i64
    llvm.return %now : i64
  }
  // Warm-up helper: calls the kernel 1000 times with %arg0 and returns nothing.
  llvm.func @warmUp(%arg0 : INPUTS_PTR) -> () attributes {llvm.emit_c_interface, sym_visibility = "private"} {
    %c0 = llvm.mlir.constant(0 : index) : i64
    %c1 = llvm.mlir.constant(1 : index) : i64
    // Trip count of the warm-up loop.
    %c1000 = llvm.mlir.constant(1000 : index) : i64
    llvm.br ^bb1(%c0 : i64)
  ^bb1(%2: i64):  // 2 preds: ^bb0, ^bb2
    // Loop header: continue while the counter %2 is below the trip count.
    %1 = llvm.icmp "slt" %2, %c1000 : i64
    llvm.cond_br %1, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    // Loop body: one kernel call, then advance the counter by one.
    llvm.call @KERNEL_NAME(%arg0) : (INPUTS_PTR) -> ()
    %3 = llvm.add %2, %c1  : i64
    llvm.br ^bb1(%3 : i64)
  ^bb3:  // pred: ^bb1
    llvm.return
  }
  // Body-less declaration of the symbol that @nanoTime calls; its definition is not part of this template.
  llvm.func @_mlir_ciface_nanoTime() -> i64 attributes {llvm.emit_c_interface, sym_visibility = "private"}
  // Entry point: runs @warmUp, then brackets a counted loop of kernel calls with
  // two @nanoTime readings and stores their difference through %arg_time.
  llvm.func @main(INPUTS_NAME : INPUTS_PTR, %arg_time: !llvm.ptr<i64>) attributes {llvm.emit_c_interface, sym_visibility = "public"} {
    %zero = llvm.mlir.constant(0 : index) : i64
    %step = llvm.mlir.constant(1 : index) : i64
    %limit = llvm.mlir.constant(CTIMES : index) : i64
    llvm.call @warmUp(INPUTS_NAME) : (INPUTS_PTR) -> ()
    // First clock reading, taken after the warm-up has finished.
    %t_begin = llvm.call @nanoTime() : () -> i64
    llvm.br ^loop_head(%zero : i64)
  ^loop_head(%iter: i64):  // 2 preds: entry block, ^loop_body
    %keep_going = llvm.icmp "slt" %iter, %limit : i64
    llvm.cond_br %keep_going, ^loop_body, ^loop_exit
  ^loop_body:  // pred: ^loop_head
    llvm.call @KERNEL_NAME(INPUTS_NAME) : (INPUTS_PTR) -> ()
    %iter_next = llvm.add %iter, %step  : i64
    llvm.br ^loop_head(%iter_next : i64)
  ^loop_exit:  // pred: ^loop_head
    // Second clock reading; the difference covers the whole loop.
    %t_end = llvm.call @nanoTime() : () -> i64
    %elapsed = llvm.sub %t_end, %t_begin  : i64
    llvm.store %elapsed, %arg_time : !llvm.ptr<i64>
    llvm.return
  }
'''