#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace gpu {
template <typename T> using Private = [[clang::opencl_private]] T;
template <typename T> using Constant = [[clang::opencl_constant]] T;
template <typename T> using Local = [[clang::opencl_local]] T;
template <typename T> using Global = [[clang::opencl_global]] T;
LIBC_INLINE uint32_t get_num_blocks_x() {
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
}
LIBC_INLINE uint32_t get_num_blocks_y() {
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
}
LIBC_INLINE uint32_t get_num_blocks_z() {
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
}
LIBC_INLINE uint64_t get_num_blocks() {
return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
}
LIBC_INLINE uint32_t get_block_id_x() {
return __builtin_amdgcn_workgroup_id_x();
}
LIBC_INLINE uint32_t get_block_id_y() {
return __builtin_amdgcn_workgroup_id_y();
}
LIBC_INLINE uint32_t get_block_id_z() {
return __builtin_amdgcn_workgroup_id_z();
}
LIBC_INLINE uint64_t get_block_id() {
return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
}
LIBC_INLINE uint32_t get_num_threads_x() {
return __builtin_amdgcn_workgroup_size_x();
}
LIBC_INLINE uint32_t get_num_threads_y() {
return __builtin_amdgcn_workgroup_size_y();
}
LIBC_INLINE uint32_t get_num_threads_z() {
return __builtin_amdgcn_workgroup_size_z();
}
LIBC_INLINE uint64_t get_num_threads() {
return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
}
LIBC_INLINE uint32_t get_thread_id_x() {
return __builtin_amdgcn_workitem_id_x();
}
LIBC_INLINE uint32_t get_thread_id_y() {
return __builtin_amdgcn_workitem_id_y();
}
LIBC_INLINE uint32_t get_thread_id_z() {
return __builtin_amdgcn_workitem_id_z();
}
LIBC_INLINE uint64_t get_thread_id() {
return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
LIBC_INLINE uint32_t get_lane_size() {
return __builtin_amdgcn_wavefrontsize();
}
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
return __builtin_amdgcn_read_exec();
}
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
uint32_t x) {
return __builtin_amdgcn_readfirstlane(x);
}
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
return lane_mask & __builtin_amdgcn_ballot_w64(x);
}
[[clang::convergent]] LIBC_INLINE void sync_threads() {
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}
[[clang::convergent]] LIBC_INLINE void memory_fence() {
__builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
}
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
__builtin_amdgcn_wave_barrier();
}
[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
uint32_t x) {
return __builtin_amdgcn_ds_bpermute(idx << 2, x);
}
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
LIBC_INLINE uint64_t fixed_frequency_clock() {
return __builtin_readsteadycounter();
}
[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
LIBC_INLINE uint32_t get_cluster_id() { return 0; }
}
}
#endif