#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace gpu {
/// Type aliases that attach one of Clang's OpenCL address-space attributes to
/// the type `T`: `Private` applies `opencl_private`, `Constant` applies
/// `opencl_constant`, `Local` applies `opencl_local`, and `Global` applies
/// `opencl_global`.
template <typename T> using Private = [[clang::opencl_private]] T;
template <typename T> using Constant = [[clang::opencl_constant]] T;
template <typename T> using Local = [[clang::opencl_local]] T;
template <typename T> using Global = [[clang::opencl_global]] T;
/// Returns the number of blocks in the 'x' dimension of the grid, as read from
/// the PTX `nctaid.x` special register.
LIBC_INLINE uint32_t get_num_blocks_x() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_nctaid_x());
}
/// Returns the number of blocks in the 'y' dimension of the grid, as read from
/// the PTX `nctaid.y` special register.
LIBC_INLINE uint32_t get_num_blocks_y() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_nctaid_y());
}
/// Returns the number of blocks in the 'z' dimension of the grid, as read from
/// the PTX `nctaid.z` special register.
LIBC_INLINE uint32_t get_num_blocks_z() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_nctaid_z());
}
/// Returns the total number of blocks in the grid, i.e. the product of the
/// block counts in the 'x', 'y' and 'z' dimensions.
LIBC_INLINE uint64_t get_num_blocks() {
  // Each factor is a full 32-bit value, so their product can exceed 32 bits.
  // Widen the first operand so the whole multiplication is carried out in
  // 64-bit arithmetic instead of wrapping before the conversion on return.
  return static_cast<uint64_t>(get_num_blocks_x()) * get_num_blocks_y() *
         get_num_blocks_z();
}
/// Returns the 'x' index of the calling thread's block within the grid, as
/// read from the PTX `ctaid.x` special register.
LIBC_INLINE uint32_t get_block_id_x() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ctaid_x());
}
/// Returns the 'y' index of the calling thread's block within the grid, as
/// read from the PTX `ctaid.y` special register.
LIBC_INLINE uint32_t get_block_id_y() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ctaid_y());
}
/// Returns the 'z' index of the calling thread's block within the grid, as
/// read from the PTX `ctaid.z` special register.
LIBC_INLINE uint32_t get_block_id_z() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ctaid_z());
}
/// Returns the linearised ('x'-major) index of the calling thread's block
/// within the grid:
///   id_x + num_x * id_y + num_x * num_y * id_z
LIBC_INLINE uint64_t get_block_id() {
  // Hold the dimensions as 64-bit values so every product and sum below is
  // evaluated in 64-bit arithmetic; with 32-bit operands the intermediate
  // results would wrap before being widened to the return type.
  const uint64_t num_x = get_num_blocks_x();
  const uint64_t num_y = get_num_blocks_y();
  return get_block_id_x() + num_x * get_block_id_y() +
         num_x * num_y * get_block_id_z();
}
/// Returns the number of threads in the 'x' dimension of a block, as read from
/// the PTX `ntid.x` special register.
LIBC_INLINE uint32_t get_num_threads_x() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ntid_x());
}
/// Returns the number of threads in the 'y' dimension of a block, as read from
/// the PTX `ntid.y` special register.
LIBC_INLINE uint32_t get_num_threads_y() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ntid_y());
}
/// Returns the number of threads in the 'z' dimension of a block, as read from
/// the PTX `ntid.z` special register.
LIBC_INLINE uint32_t get_num_threads_z() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_ntid_z());
}
/// Returns the total number of threads in a block, i.e. the product of the
/// thread counts in the 'x', 'y' and 'z' dimensions. The product is formed in
/// 32-bit unsigned arithmetic and widened on return.
LIBC_INLINE uint64_t get_num_threads() {
  const uint32_t threads_xy = get_num_threads_x() * get_num_threads_y();
  const uint32_t threads_xyz = threads_xy * get_num_threads_z();
  return threads_xyz;
}
/// Returns the 'x' index of the calling thread within its block, as read from
/// the PTX `tid.x` special register.
LIBC_INLINE uint32_t get_thread_id_x() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_tid_x());
}
/// Returns the 'y' index of the calling thread within its block, as read from
/// the PTX `tid.y` special register.
LIBC_INLINE uint32_t get_thread_id_y() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_tid_y());
}
/// Returns the 'z' index of the calling thread within its block, as read from
/// the PTX `tid.z` special register.
LIBC_INLINE uint32_t get_thread_id_z() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_tid_z());
}
/// Returns the linearised ('x'-major) index of the calling thread within its
/// block:
///   id_x + num_x * id_y + num_x * num_y * id_z
/// The sum is formed in 32-bit unsigned arithmetic and widened on return.
LIBC_INLINE uint64_t get_thread_id() {
  const uint32_t width = get_num_threads_x();
  const uint32_t height = get_num_threads_y();
  const uint32_t row_offset = width * get_thread_id_y();
  const uint32_t plane_offset = width * height * get_thread_id_z();
  return get_thread_id_x() + row_offset + plane_offset;
}
/// Returns the number of lanes in a warp, which this target fixes at 32.
LIBC_INLINE uint32_t get_lane_size() {
  constexpr uint32_t LANES_PER_WARP = 32;
  return LANES_PER_WARP;
}
/// Returns the calling thread's lane index within its warp, as read from the
/// PTX `laneid` special register.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_laneid());
}
/// Returns the mask produced by `__nvvm_activemask()`, widened to 64 bits.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  return static_cast<uint64_t>(__nvvm_activemask());
}
/// Returns the value of `x` held by the lane corresponding to the lowest set
/// bit of `lane_mask`, obtained through an indexed warp shuffle.
///
/// \param lane_mask Participating lanes; only the low 32 bits are used.
/// \param x         The calling lane's value.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
                                                           uint32_t x) {
  const uint32_t active = static_cast<uint32_t>(lane_mask);
  // `__builtin_ffs` is one-based, so subtract one to get a lane index.
  const uint32_t source_lane = __builtin_ffs(active) - 1;
  const uint32_t last_lane = get_lane_size() - 1;
  return __nvvm_shfl_sync_idx_i32(active, x, source_lane, last_lane);
}
/// Performs a warp vote over the lanes in `lane_mask` and returns the result
/// of `__nvvm_vote_ballot_sync` for the predicate `x`, widened to 64 bits.
///
/// \param lane_mask Participating lanes; only the low 32 bits are used.
/// \param x         The calling lane's predicate.
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
  const uint32_t active = static_cast<uint32_t>(lane_mask);
  const uint64_t votes = __nvvm_vote_ballot_sync(active, x);
  return votes;
}
/// Executes the `__syncthreads()` barrier.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
  __syncthreads();
}
/// Issues a memory barrier through `__nvvm_membar_sys()` (PTX `membar.sys`).
[[clang::convergent]] LIBC_INLINE void memory_fence() {
  __nvvm_membar_sys();
}
/// Executes a warp-level barrier over the lanes selected by `mask`.
///
/// \param mask Participating lanes; only the low 32 bits are used.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
  const uint32_t lanes = static_cast<uint32_t>(mask);
  __nvvm_bar_warp_sync(lanes);
}
/// Reads the value of `x` held by lane `idx` through an indexed warp shuffle.
/// The result is forced to zero when bit `idx` of `lane_mask` is clear.
///
/// \param lane_mask Participating lanes; only the low 32 bits are used.
/// \param idx       Lane to read from.
/// \param x         The calling lane's value.
[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
                                                   uint32_t idx, uint32_t x) {
  const uint32_t active = static_cast<uint32_t>(lane_mask);
  // All ones when lane `idx` is set in the mask, zero otherwise.
  const uint32_t keep = -((active >> idx) & 1);
  // The shuffle itself is executed regardless of whether the result is kept.
  const uint32_t shuffled =
      __nvvm_shfl_sync_idx_i32(active, x, idx, get_lane_size() - 1);
  return keep & shuffled;
}
/// Returns the current value of the cycle counter reported by
/// `__builtin_readcyclecounter()`.
LIBC_INLINE uint64_t processor_clock() {
  return static_cast<uint64_t>(__builtin_readcyclecounter());
}
/// Returns the current value of the counter reported by
/// `__builtin_readsteadycounter()`.
// NOTE(review): the tick frequency is not visible from this file; confirm
// against the target documentation before converting to wall-clock time.
LIBC_INLINE uint64_t fixed_frequency_clock() {
  return static_cast<uint64_t>(__builtin_readsteadycounter());
}
/// Terminates execution by calling `__nvvm_exit()`; never returns.
[[noreturn]] LIBC_INLINE void end_program() {
  __nvvm_exit();
}
/// Returns the value of the PTX `smid` special register, which this target
/// uses as the cluster identifier.
LIBC_INLINE uint32_t get_cluster_id() {
  return static_cast<uint32_t>(__nvvm_read_ptx_sreg_smid());
}
}
}
#endif