#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
// Times an empty profiling region: the same pair of clock reads and asm
// statements that latency() places around the call under test, with a plain
// copy standing in for that call.
//
// Returns the difference between the two gpu::processor_clock() readings.
// NOTE(review): presumably intended as a baseline to subtract from latency()
// results -- confirm against the callers.
[[gnu::noinline]] static uint64_t overhead() {
  // Route the input through a volatile object so the value used below has to
  // be loaded rather than folded to a constant.
  volatile uint32_t seed = 1;
  uint32_t input = seed;

  uint64_t begin = gpu::processor_clock();
  // Empty asm statement that takes both values as operands, so both must have
  // been produced before execution reaches this point.
  asm("" ::"r"(input), "llr"(begin));

  // Stand-in for the work that latency() would time.
  uint32_t value = input;

  // OR with zero leaves the register contents unchanged; it only makes the
  // value an operand of an instruction emitted ahead of the second clock read.
  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(value) :);
  uint64_t end = gpu::processor_clock();

  // Volatile store so the value is still observably used after the timing.
  volatile auto sink = value;
  return end - begin;
}
// Times a single invocation of a unary callable.
//
// f: callable invoked exactly once as f(arg), where arg is a copy of t that
//    has been round-tripped through a volatile object.
// t: input value handed to f.
//
// Returns the difference between the gpu::processor_clock() readings taken
// immediately before and immediately after the call. The value includes the
// cost of the surrounding bookkeeping instructions.
template <typename F, typename T>
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
  // Round-trip the input through a volatile object so the argument has to be
  // loaded rather than constant-propagated into the call.
  volatile T storage = t;
  T arg = storage;
  // Empty asm statement taking the argument as an operand, so it must have
  // been produced before the fence and the first clock read.
  asm("" ::"r"(arg));

  gpu::memory_fence();
  uint64_t start = gpu::processor_clock();
  // Make both the argument and the start timestamp operands of an asm
  // statement placed ahead of the call.
  asm("" ::"r"(arg), "llr"(start));

  auto result = f(arg);

  // OR with zero leaves the register contents unchanged; it only makes the
  // result an operand of an instruction emitted ahead of the second clock
  // read.
  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
  uint64_t stop = gpu::processor_clock();
  gpu::memory_fence();
  asm("" ::"r"(stop));

  // Sink the result through a volatile object of the result's own type. The
  // callable's return type need not be T, so declaring the sink as `volatile
  // T` would force an implicit conversion (or fail to compile) whenever they
  // differ; this matches the two-argument overload.
  volatile auto output = result;
  return stop - start;
}
// Times a single invocation of a binary callable.
//
// f:  callable invoked exactly once as f(arg, arg2), where the arguments are
//     copies of t1 and t2 that have been round-tripped through volatile
//     objects.
// t1: first input value handed to f.
// t2: second input value handed to f.
//
// Returns the difference between the gpu::processor_clock() readings taken
// immediately before and immediately after the call. The value includes the
// cost of the surrounding bookkeeping instructions.
//
// Marked [[gnu::noinline]] like the single-argument overload, so that both
// overloads keep the timed region in its own out-of-line function instead of
// letting it be merged into the caller's code.
template <typename F, typename T1, typename T2>
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
  // Round-trip the inputs through volatile objects so the arguments have to
  // be loaded rather than constant-propagated into the call.
  volatile T1 storage = t1;
  volatile T2 storage2 = t2;
  T1 arg = storage;
  T2 arg2 = storage2;
  // Empty asm statement taking both arguments as operands, so they must have
  // been produced before the fence and the first clock read.
  asm("" ::"r"(arg), "r"(arg2));

  gpu::memory_fence();
  uint64_t start = gpu::processor_clock();
  // Make the arguments and the start timestamp operands of an asm statement
  // placed ahead of the call.
  asm("" ::"r"(arg), "r"(arg2), "llr"(start));

  auto result = f(arg, arg2);

  // OR with zero leaves the register contents unchanged; it only makes the
  // result an operand of an instruction emitted ahead of the second clock
  // read.
  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
  uint64_t stop = gpu::processor_clock();
  gpu::memory_fence();
  asm("" ::"r"(stop));

  // Volatile store so the result is still observably used after the timing.
  volatile auto output = result;
  return stop - start;
}
}
#endif