#include <atomic>
#include <cassert>
#include <cstddef>
#include <deque>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"
#include "GlobalHandler.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginInterface.h"
#include "UtilitiesRTL.h"
#include "omptarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
!defined(__ORDER_BIG_ENDIAN__)
#error "Missing preprocessor definitions for endianness detection."
#endif
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LITTLEENDIAN_CPU
#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BIGENDIAN_CPU
#endif
#if defined(__has_include)
#if __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#elif __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif
namespace llvm {
namespace omp {
namespace target {
namespace plugin {
struct AMDGPUKernelTy;
struct AMDGPUDeviceTy;
struct AMDGPUPluginTy;
struct AMDGPUStreamTy;
struct AMDGPUEventTy;
struct AMDGPUStreamManagerTy;
struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;
namespace utils {
template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem);
};
return Func(L, static_cast<void *>(&Cb));
}
template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem);
};
return Func(FuncArg, L, static_cast<void *>(&Cb));
}
template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
typename IterFuncArgTy, typename CallbackTy>
hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
return (*Unwrapped)(Elem1, Elem2);
};
return Func(FuncArg, L, static_cast<void *>(&Cb));
}
template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
}
template <typename CallbackTy>
Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
}
template <typename CallbackTy>
Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
hsa_amd_agent_iterate_memory_pools, Agent, Cb);
return Plugin::check(Status,
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
const void *Src, hsa_agent_t SrcAgent, size_t Size,
uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
hsa_signal_t CompletionSignal) {
if (!UseMultipleSdmaEngines) {
hsa_status_t S =
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
NumDepSignals, DepSignals, CompletionSignal);
return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
}
#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
#else
static std::atomic<int> SdmaEngine{1};
int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
true);
LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}
Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
std::string Target;
auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
uint32_t Length;
hsa_status_t Status;
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
if (Status != HSA_STATUS_SUCCESS)
return Status;
llvm::SmallVector<char> ISAName(Length);
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
if (Status != HSA_STATUS_SUCCESS)
return Status;
llvm::StringRef TripleTarget(ISAName.begin(), Length);
if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
Target = TripleTarget.ltrim('-').rtrim('\0').str();
return HSA_STATUS_SUCCESS;
});
if (Err)
return Err;
return Target;
}
}
template <typename ResourceTy>
struct AMDGPUResourceRef : public GenericDeviceResourceRef {
using HandleTy = ResourceTy *;
AMDGPUResourceRef() : Resource(nullptr) {}
AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
virtual ~AMDGPUResourceRef() {}
Error create(GenericDeviceTy &Device) override;
Error destroy(GenericDeviceTy &Device) override {
if (!Resource)
return Plugin::error("Destroying an invalid resource");
if (auto Err = Resource->deinit())
return Err;
delete Resource;
Resource = nullptr;
return Plugin::success();
}
operator HandleTy() const { return Resource; }
private:
HandleTy Resource;
};
struct AMDGPUMemoryPoolTy {
AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
: MemoryPool(MemoryPool), GlobalFlags(0) {}
Error init() {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
return Err;
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
return Plugin::success();
}
hsa_amd_memory_pool_t get() const { return MemoryPool; }
bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
bool isFineGrained() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
}
bool isCoarseGrained() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
}
bool supportsKernelArgs() const {
assert(isGlobal() && "Not global memory");
return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
}
Error allocate(size_t Size, void **PtrStorage) {
hsa_status_t Status =
hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
}
Error deallocate(void *Ptr) {
hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
}
bool canAccess(hsa_agent_t Agent) {
hsa_amd_memory_pool_access_t Access;
if (hsa_amd_agent_memory_pool_get_info(
Agent, MemoryPool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &Access))
return false;
return Access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
}
Error enableAccess(void *Ptr, int64_t Size,
const llvm::SmallVector<hsa_agent_t> &Agents) const {
#ifdef OMPTARGET_DEBUG
for (hsa_agent_t Agent : Agents) {
hsa_amd_memory_pool_access_t Access;
if (auto Err =
getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
return Err;
if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
return Plugin::error("An agent is not allowed to access a memory pool");
}
#endif
hsa_status_t Status =
hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
}
template <typename Ty>
Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
hsa_status_t Status;
Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
}
template <typename Ty>
hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
}
template <typename Ty>
Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
Ty &Value) const {
hsa_status_t Status;
Status =
hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
return Plugin::check(Status,
"Error in hsa_amd_agent_memory_pool_get_info: %s");
}
private:
hsa_amd_memory_pool_t MemoryPool;
hsa_amd_segment_t Segment;
uint32_t GlobalFlags;
};
struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
: Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
Error init(AMDGPUMemoryPoolTy &MemoryPool) {
const uint32_t Threshold = 1 << 30;
this->MemoryManager = new MemoryManagerTy(*this, Threshold);
this->MemoryPool = &MemoryPool;
return Plugin::success();
}
Error deinit() {
assert(MemoryManager && "Invalid memory manager");
delete MemoryManager;
MemoryManager = nullptr;
return Plugin::success();
}
Error allocate(size_t Size, void **PtrStorage) {
assert(MemoryManager && "Invalid memory manager");
assert(PtrStorage && "Invalid pointer storage");
*PtrStorage = MemoryManager->allocate(Size, nullptr);
if (*PtrStorage == nullptr)
return Plugin::error("Failure to allocate from AMDGPU memory manager");
return Plugin::success();
}
Error deallocate(void *Ptr) {
assert(Ptr && "Invalid pointer");
if (MemoryManager->free(Ptr))
return Plugin::error("Failure to deallocate from AMDGPU memory manager");
return Plugin::success();
}
private:
void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
int free(void *TgtPtr, TargetAllocTy Kind) override {
if (auto Err = MemoryPool->deallocate(TgtPtr)) {
consumeError(std::move(Err));
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
AMDGPUPluginTy &Plugin;
AMDGPUMemoryPoolTy *MemoryPool;
MemoryManagerTy *MemoryManager;
};
struct AMDGPUDeviceImageTy : public DeviceImageTy {
AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
const __tgt_device_image *TgtImage)
: DeviceImageTy(ImageId, Device, TgtImage) {}
Error loadExecutable(const AMDGPUDeviceTy &Device);
Error unloadExecutable() {
hsa_status_t Status = hsa_executable_destroy(Executable);
if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s"))
return Err;
Status = hsa_code_object_destroy(CodeObject);
return Plugin::check(Status, "Error in hsa_code_object_destroy: %s");
}
hsa_executable_t getExecutable() const { return Executable; }
uint16_t getELFABIVersion() const { return ELFABIVersion; }
Expected<hsa_executable_symbol_t>
findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
std::optional<utils::KernelMetaDataTy>
getKernelInfo(StringRef Identifier) const {
auto It = KernelInfoMap.find(Identifier);
if (It == KernelInfoMap.end())
return {};
return It->second;
}
private:
hsa_executable_t Executable;
hsa_code_object_t CodeObject;
StringMap<utils::KernelMetaDataTy> KernelInfoMap;
uint16_t ELFABIVersion;
};
struct AMDGPUKernelTy : public GenericKernelTy {
AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
std::string KernelName(getName());
KernelName += ".kd";
auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
if (!SymbolOrErr)
return SymbolOrErr.takeError();
hsa_executable_symbol_t Symbol = *SymbolOrErr;
hsa_symbol_kind_t SymbolType;
hsa_status_t Status;
std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
for (auto &Info : RequiredInfos) {
Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
if (auto Err = Plugin::check(
Status, "Error in hsa_executable_symbol_get_info: %s"))
return Err;
}
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error("Symbol %s is not a kernel function");
ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
KernelInfo = AMDImage.getKernelInfo(getName());
if (!KernelInfo.has_value())
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
"Could not read extra information for kernel %s.", getName());
return Plugin::success();
}
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const override;
uint32_t getGroupSize() const { return GroupSize; }
uint32_t getPrivateSize() const { return PrivateSize; }
uint64_t getKernelObject() const { return KernelObject; }
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
bool usesDynamicStack() const { return DynamicStack; }
private:
uint64_t KernelObject;
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
bool DynamicStack;
uint32_t ImplicitArgsSize;
std::optional<utils::KernelMetaDataTy> KernelInfo;
};
struct AMDGPUSignalTy {
AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
Error init(uint32_t InitialValue = 1) {
hsa_status_t Status =
hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
return Plugin::check(Status, "Error in hsa_signal_create: %s");
}
Error deinit() {
hsa_status_t Status = hsa_signal_destroy(HSASignal);
return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
}
Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
GenericDeviceTy *Device = nullptr) const {
if (ActiveTimeout && !RPCServer) {
hsa_signal_value_t Got = 1;
Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
if (Got == 0)
return Plugin::success();
}
uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
Timeout, WaitState) != 0) {
if (RPCServer && Device)
if (auto Err = RPCServer->runServer(*Device))
return Err;
}
return Plugin::success();
}
hsa_signal_value_t load() const {
return hsa_signal_load_scacquire(HSASignal);
}
void signal() {
assert(load() > 0 && "Invalid signal value");
hsa_signal_subtract_screlease(HSASignal, 1);
}
void reset() { hsa_signal_store_screlease(HSASignal, 1); }
void increaseUseCount() { UseCount.increase(); }
bool decreaseUseCount() { return UseCount.decrease(); }
hsa_signal_t get() const { return HSASignal; }
private:
hsa_signal_t HSASignal;
RefCountTy<> UseCount;
};
using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
struct AMDGPUQueueTy {
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
Error init(hsa_agent_t Agent, int32_t QueueSize) {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}
Error deinit() {
std::lock_guard<std::mutex> Lock(Mutex);
if (!Queue)
return Plugin::success();
hsa_status_t Status = hsa_queue_destroy(Queue);
return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
}
bool getUserCount() const { return NumUsers; }
bool isInitialized() { return Queue != nullptr; }
void removeUser() { --NumUsers; }
void addUser() { ++NumUsers; }
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize, uint64_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
std::lock_guard<std::mutex> Lock(Mutex);
assert(Queue && "Interacted with a non-initialized queue!");
if (InputSignal && InputSignal->load())
if (auto Err = pushBarrierImpl(nullptr, InputSignal))
return Err;
uint64_t PacketId;
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
assert(Packet && "Invalid packet");
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
Packet->reserved0 = 0;
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size =
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
Packet->reserved2 = 0;
Packet->completion_signal = OutputSignal->get();
publishKernelPacket(PacketId, Setup, Packet);
return Plugin::success();
}
Error pushBarrier(AMDGPUSignalTy *OutputSignal,
const AMDGPUSignalTy *InputSignal1,
const AMDGPUSignalTy *InputSignal2) {
std::lock_guard<std::mutex> Lock(Mutex);
assert(Queue && "Interacted with a non-initialized queue!");
return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
}
private:
Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
const AMDGPUSignalTy *InputSignal1,
const AMDGPUSignalTy *InputSignal2 = nullptr) {
uint64_t PacketId;
hsa_barrier_and_packet_t *Packet =
(hsa_barrier_and_packet_t *)acquirePacket(PacketId);
assert(Packet && "Invalid packet");
Packet->reserved0 = 0;
Packet->reserved1 = 0;
Packet->dep_signal[0] = {0};
Packet->dep_signal[1] = {0};
Packet->dep_signal[2] = {0};
Packet->dep_signal[3] = {0};
Packet->dep_signal[4] = {0};
Packet->reserved2 = 0;
Packet->completion_signal = {0};
if (OutputSignal)
Packet->completion_signal = OutputSignal->get();
if (InputSignal1)
Packet->dep_signal[0] = InputSignal1->get();
if (InputSignal2)
Packet->dep_signal[1] = InputSignal2->get();
publishBarrierPacket(PacketId, Packet);
return Plugin::success();
}
hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
;
const uint32_t Mask = Queue->size - 1;
return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
(PacketId & Mask);
}
void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
hsa_kernel_dispatch_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
}
void publishBarrierPacket(uint64_t PacketId,
hsa_barrier_and_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
uint16_t Setup = 0;
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
}
static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
hsa_queue_t *Queue;
std::mutex Mutex;
uint32_t NumUsers;
};
struct AMDGPUStreamTy {
private:
struct MemcpyArgsTy {
void *Dst;
const void *Src;
size_t Size;
};
struct ReleaseBufferArgsTy {
void *Buffer;
AMDGPUMemoryManagerTy *MemoryManager;
};
struct ReleaseSignalArgsTy {
AMDGPUSignalTy *Signal;
AMDGPUSignalManagerTy *SignalManager;
};
struct StreamSlotTy {
AMDGPUSignalTy *Signal;
Error (*ActionFunction)(void *);
union {
MemcpyArgsTy MemcpyArgs;
ReleaseBufferArgsTy ReleaseBufferArgs;
ReleaseSignalArgsTy ReleaseSignalArgs;
} ActionArgs;
StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
ActionFunction = memcpyAction;
ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
return Plugin::success();
}
Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
ActionFunction = releaseBufferAction;
ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
return Plugin::success();
}
Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
AMDGPUSignalManagerTy *SignalManager) {
ActionFunction = releaseSignalAction;
ActionArgs.ReleaseSignalArgs =
ReleaseSignalArgsTy{SignalToRelease, SignalManager};
return Plugin::success();
}
Error performAction() {
if (!ActionFunction)
return Plugin::success();
if (ActionFunction == memcpyAction) {
if (auto Err = memcpyAction(&ActionArgs))
return Err;
} else if (ActionFunction == releaseBufferAction) {
if (auto Err = releaseBufferAction(&ActionArgs))
return Err;
} else if (ActionFunction == releaseSignalAction) {
if (auto Err = releaseSignalAction(&ActionArgs))
return Err;
} else {
return Plugin::error("Unknown action function!");
}
ActionFunction = nullptr;
return Plugin::success();
}
};
hsa_agent_t Agent;
AMDGPUQueueTy *Queue;
AMDGPUSignalManagerTy &SignalManager;
GenericDeviceTy &Device;
std::deque<StreamSlotTy> Slots;
uint32_t NextSlot;
uint32_t SyncCycle;
RPCServerTy *RPCServer;
mutable std::mutex Mutex;
const uint64_t StreamBusyWaitMicroseconds;
bool UseMultipleSdmaEngines;
uint32_t size() const { return NextSlot; }
uint32_t last() const { return size() - 1; }
std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
if (Slots.size() == NextSlot)
Slots.resize(Slots.size() * 2);
uint32_t Curr = NextSlot++;
AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
Slots[Curr].Signal = OutputSignal;
return std::make_pair(Curr, InputSignal);
}
Error complete() {
for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) {
if (auto Err = Slots[Slot].performAction())
return Err;
if (Slots[Slot].Signal->decreaseUseCount())
if (auto Err = SignalManager.returnResource(Slots[Slot].Signal))
return Err;
Slots[Slot].Signal = nullptr;
}
NextSlot = 0;
SyncCycle += 1;
return Plugin::success();
}
Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal;
OtherSignal->increaseUseCount();
AMDGPUSignalTy *OutputSignal = nullptr;
if (auto Err = SignalManager.getResource(OutputSignal))
return Err;
OutputSignal->reset();
OutputSignal->increaseUseCount();
auto [Curr, InputSignal] = consume(OutputSignal);
if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager))
return Err;
return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal);
}
static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
assert(Slot && "Invalid slot");
assert(Slot->Signal && "Invalid signal");
std::atomic_thread_fence(std::memory_order_acquire);
if (auto Err = Slot->performAction())
FATAL_MESSAGE(1, "Error peforming post action: %s",
toString(std::move(Err)).data());
Slot->Signal->signal();
return false;
}
static Error memcpyAction(void *Data) {
MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->Dst && "Invalid destination buffer");
assert(Args->Src && "Invalid source buffer");
std::memcpy(Args->Dst, Args->Src, Args->Size);
return Plugin::success();
}
static Error releaseBufferAction(void *Data) {
ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->MemoryManager && "Invalid memory manager");
assert(Args->Buffer && "Invalid buffer");
return Args->MemoryManager->deallocate(Args->Buffer);
}
static Error releaseSignalAction(void *Data) {
ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
assert(Args && "Invalid arguments");
assert(Args->Signal && "Invalid signal");
assert(Args->SignalManager && "Invalid signal manager");
if (Args->Signal->decreaseUseCount())
if (auto Err = Args->SignalManager->returnResource(Args->Signal))
return Err;
return Plugin::success();
}
public:
AMDGPUStreamTy(AMDGPUDeviceTy &Device);
Error init() { return Plugin::success(); }
Error deinit() { return Plugin::success(); }
void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
AMDGPUSignalTy *OutputSignal = nullptr;
if (auto Err = SignalManager.getResource(OutputSignal))
return Err;
OutputSignal->reset();
OutputSignal->increaseUseCount();
std::lock_guard<std::mutex> StreamLock(Mutex);
auto [Curr, InputSignal] = consume(OutputSignal);
if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
return Err;
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
GroupSize, StackSize, OutputSignal,
InputSignal);
}
Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
uint64_t CopySize) {
AMDGPUSignalTy *OutputSignal = nullptr;
if (auto Err = SignalManager.getResource(OutputSignal))
return Err;
OutputSignal->reset();
OutputSignal->increaseUseCount();
std::lock_guard<std::mutex> Lock(Mutex);
auto [Curr, InputSignal] = consume(OutputSignal);
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
CopySize, 0, nullptr, OutputSignal->get());
}
Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
AMDGPUMemoryManagerTy &MemoryManager) {
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(2, OutputSignals))
return Err;
for (auto *Signal : OutputSignals) {
Signal->reset();
Signal->increaseUseCount();
}
std::lock_guard<std::mutex> Lock(Mutex);
auto [Curr, InputSignal] = consume(OutputSignals[0]);
if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
return Err;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
if (auto Err = utils::asyncMemCopy(
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
&InputSignalRaw, OutputSignals[0]->get()))
return Err;
} else {
if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
Src, Agent, CopySize, 0, nullptr,
OutputSignals[0]->get()))
return Err;
}
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
assert(InputSignal && "Invalid input signal");
if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
return Err;
std::atomic_thread_fence(std::memory_order_release);
hsa_status_t Status = hsa_amd_signal_async_handler(
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
(void *)&Slots[Curr]);
return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
}
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
AMDGPUMemoryManagerTy &MemoryManager) {
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(2, OutputSignals))
return Err;
for (auto *Signal : OutputSignals) {
Signal->reset();
Signal->increaseUseCount();
}
AMDGPUSignalTy *OutputSignal = OutputSignals[0];
std::lock_guard<std::mutex> Lock(Mutex);
auto [Curr, InputSignal] = consume(OutputSignal);
if (InputSignal && InputSignal->load()) {
if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
return Err;
std::atomic_thread_fence(std::memory_order_release);
hsa_status_t Status = hsa_amd_signal_async_handler(
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
(void *)&Slots[Curr]);
if (auto Err = Plugin::check(Status,
"Error in hsa_amd_signal_async_handler: %s"))
return Err;
OutputSignal = OutputSignals[1];
std::tie(Curr, InputSignal) = consume(OutputSignal);
} else {
std::memcpy(Inter, Src, CopySize);
OutputSignals[1]->decreaseUseCount();
if (auto Err = SignalManager.returnResource(OutputSignals[1]))
return Err;
}
if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
return Err;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
Agent, CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
CopySize, 0, nullptr, OutputSignal->get());
}
Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
hsa_agent_t SrcAgent, uint64_t CopySize) {
AMDGPUSignalTy *OutputSignal;
if (auto Err = SignalManager.getResources(1, &OutputSignal))
return Err;
OutputSignal->reset();
OutputSignal->increaseUseCount();
std::lock_guard<std::mutex> Lock(Mutex);
auto [Curr, InputSignal] = consume(OutputSignal);
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
SrcAgent, CopySize, 1, &InputSignalRaw,
OutputSignal->get());
}
return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
SrcAgent, CopySize, 0, nullptr,
OutputSignal->get());
}
Error synchronize() {
std::lock_guard<std::mutex> Lock(Mutex);
if (size() == 0)
return Plugin::success();
if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
RPCServer, &Device))
return Err;
return complete();
}
Expected<bool> query() {
std::lock_guard<std::mutex> Lock(Mutex);
if (size() == 0)
return true;
if (Slots[last()].Signal->load())
return false;
if (auto Err = complete())
return std::move(Err);
return true;
}
Error recordEvent(AMDGPUEventTy &Event) const;
Error waitEvent(const AMDGPUEventTy &Event);
friend struct AMDGPUStreamManagerTy;
};
struct AMDGPUEventTy {
AMDGPUEventTy(AMDGPUDeviceTy &Device)
: RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
Error init() { return Plugin::success(); }
Error deinit() { return Plugin::success(); }
Error record(AMDGPUStreamTy &Stream) {
std::lock_guard<std::mutex> Lock(Mutex);
RecordedStream = &Stream;
return Stream.recordEvent(*this);
}
Error wait(AMDGPUStreamTy &Stream) {
std::lock_guard<std::mutex> Lock(Mutex);
if (!RecordedStream)
return Plugin::error("Event does not have any recorded stream");
if (RecordedStream == &Stream)
return Plugin::success();
if (RecordedSlot < 0)
return Plugin::success();
return Stream.waitEvent(*this);
}
protected:
AMDGPUStreamTy *RecordedStream;
int64_t RecordedSlot;
int64_t RecordedSyncCycle;
mutable std::mutex Mutex;
friend struct AMDGPUStreamTy;
};
Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
std::lock_guard<std::mutex> Lock(Mutex);
if (size() > 0) {
Event.RecordedSyncCycle = SyncCycle;
Event.RecordedSlot = last();
assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
} else {
Event.RecordedSyncCycle = -1;
Event.RecordedSlot = -1;
}
return Plugin::success();
}
Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
return Plugin::success();
if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
return Plugin::success();
return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
}
struct AMDGPUStreamManagerTy final
: GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
: GenericDeviceResourceManagerTy(Device),
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
NextQueue(0), Agent(HSAAgent) {}
Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
QueueSize = HSAQueueSize;
MaxNumQueues = NumHSAQueues;
if (auto Err = Queues.front().init(Agent, QueueSize))
return Err;
return GenericDeviceResourceManagerTy::init(InitialSize);
}
Error deinit() override {
for (AMDGPUQueueTy &Queue : Queues) {
if (auto Err = Queue.deinit())
return Err;
}
return GenericDeviceResourceManagerTy::deinit();
}
virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
return assignNextQueue(Handle);
});
}
virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
Handle->Queue->removeUser();
return Plugin::success();
});
}
private:
inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
if (OMPX_QueueTracking) {
for (uint32_t I = 0; I < MaxNumQueues; ++I) {
if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
Index = I;
break;
}
if (Queues[Index].getUserCount() > Queues[I].getUserCount())
Index = I;
}
}
if (auto Err = Queues[Index].init(Agent, QueueSize))
return Err;
Queues[Index].addUser();
Stream->Queue = &Queues[Index];
return Plugin::success();
}
BoolEnvar OMPX_QueueTracking;
uint32_t NextQueue;
std::vector<AMDGPUQueueTy> Queues;
hsa_agent_t Agent;
uint32_t MaxNumQueues;
uint32_t QueueSize;
};
struct AMDGenericDeviceTy {
AMDGenericDeviceTy() {}
virtual ~AMDGenericDeviceTy() {}
Error initMemoryPools() {
Error Err = retrieveAllMemoryPools();
if (Err)
return Err;
for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
Error Err = MemoryPool->init();
if (Err)
return Err;
if (!MemoryPool->isGlobal())
continue;
if (MemoryPool->isFineGrained()) {
FineGrainedMemoryPools.push_back(MemoryPool);
if (MemoryPool->supportsKernelArgs())
ArgsMemoryPools.push_back(MemoryPool);
} else if (MemoryPool->isCoarseGrained()) {
CoarseGrainedMemoryPools.push_back(MemoryPool);
}
}
return Plugin::success();
}
Error deinitMemoryPools() {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
delete Pool;
AllMemoryPools.clear();
FineGrainedMemoryPools.clear();
CoarseGrainedMemoryPools.clear();
ArgsMemoryPools.clear();
return Plugin::success();
}
virtual Error retrieveAllMemoryPools() = 0;
virtual hsa_agent_t getAgent() const = 0;
protected:
llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
};
struct AMDHostDeviceTy : public AMDGenericDeviceTy {
AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
const llvm::SmallVector<hsa_agent_t> &HostAgents)
: AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
PinnedMemoryManager(Plugin) {
assert(HostAgents.size() && "No host agent found");
}
Error init() {
if (auto Err = initMemoryPools())
return Err;
if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
return Err;
if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
return Err;
return Plugin::success();
}
Error deinit() {
if (auto Err = deinitMemoryPools())
return Err;
if (auto Err = ArgsMemoryManager.deinit())
return Err;
if (auto Err = PinnedMemoryManager.deinit())
return Err;
return Plugin::success();
}
Error retrieveAllMemoryPools() override {
for (hsa_agent_t Agent : Agents) {
Error Err = utils::iterateAgentMemoryPools(
Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
AMDGPUMemoryPoolTy *MemoryPool =
new AMDGPUMemoryPoolTy(HSAMemoryPool);
AllMemoryPools.push_back(MemoryPool);
return HSA_STATUS_SUCCESS;
});
if (Err)
return Err;
}
return Plugin::success();
}
hsa_agent_t getAgent() const override { return Agents[0]; }
AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
return *FineGrainedMemoryPools[0];
}
AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
return *CoarseGrainedMemoryPools[0];
}
AMDGPUMemoryPoolTy &getArgsMemoryPool() {
assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
return *ArgsMemoryPools[0];
}
AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
return PinnedMemoryManager;
}
private:
const llvm::SmallVector<hsa_agent_t> Agents;
AMDGPUMemoryManagerTy ArgsMemoryManager;
AMDGPUMemoryManagerTy PinnedMemoryManager;
};
struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
: GenericDeviceTy(Plugin, DeviceId, NumDevices, {}), AMDGenericDeviceTy(),
OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
1 * 1024 * 1024),
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
64),
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
OMPX_UseMultipleSdmaEngines(
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
HostDevice(HostDevice) {}
~AMDGPUDeviceTy() {}
Error initImpl(GenericPluginTy &Plugin) override {
if (auto Err = initMemoryPools())
return Err;
char GPUName[64];
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
return Err;
ComputeUnitKind = GPUName;
uint32_t WavefrontSize = 0;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
return Err;
GridValues.GV_Warp_Size = WavefrontSize;
if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
ClockFrequency) != HSA_STATUS_SUCCESS)
ClockFrequency = 0;
if (WavefrontSize == 32)
GridValues = getAMDGPUGridValues<32>();
else if (WavefrontSize == 64)
GridValues = getAMDGPUGridValues<64>();
else
return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
uint16_t WorkgroupMaxDim[3];
if (auto Err =
getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
return Err;
GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
hsa_dim3_t GridMaxDim;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
return Err;
GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
if (GridValues.GV_Max_Teams == 0)
return Plugin::error("Maximum number of teams cannot be zero");
uint32_t ComputeUnits = 0;
if (auto Err =
getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
return Err;
GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
uint32_t WavesPerCU = 0;
if (auto Err =
getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
return Err;
HardwareParallelism = ComputeUnits * WavesPerCU;
uint32_t MaxQueueSize;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
return Err;
uint32_t MaxQueues;
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
return Err;
OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
OMPX_NumQueues, OMPX_QueueSize))
return Err;
if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
return Err;
if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
return Err;
auto TargeTripleAndFeaturesOrError =
utils::getTargetTripleAndFeatures(Agent);
if (!TargeTripleAndFeaturesOrError)
return TargeTripleAndFeaturesOrError.takeError();
if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
.contains("xnack+"))
IsXnackEnabled = true;
if (auto Err = checkIfAPU())
return Err;
return Plugin::success();
}
Error deinitImpl() override {
if (auto Err = AMDGPUStreamManager.deinit())
return Err;
if (auto Err = AMDGPUEventManager.deinit())
return Err;
if (auto Err = AMDGPUSignalManager.deinit())
return Err;
if (!LoadedImages.empty()) {
for (DeviceImageTy *Image : LoadedImages) {
AMDGPUDeviceImageTy &AMDImage =
static_cast<AMDGPUDeviceImageTy &>(*Image);
if (auto Err = AMDImage.unloadExecutable())
return Err;
}
}
Agent = {0};
return Plugin::success();
}
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini"))
Image.setPendingGlobalDtors();
return callGlobalCtorDtorCommon(Plugin, Image, true);
}
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
if (Image.hasPendingGlobalDtors())
return callGlobalCtorDtorCommon(Plugin, Image, false);
return Plugin::success();
}
uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; }
Expected<std::unique_ptr<MemoryBuffer>>
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
SmallString<128> LinkerInputFilePath;
std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
"o", LinkerInputFilePath);
if (EC)
return Plugin::error("Failed to create temporary file for linker");
Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
if (!OutputOrErr)
return OutputOrErr.takeError();
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
llvm::copy(MB->getBuffer(), Output->getBufferStart());
if (Error E = Output->commit())
return std::move(E);
SmallString<128> LinkerOutputFilePath;
EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
LinkerOutputFilePath);
if (EC)
return Plugin::error("Failed to create temporary file for linker");
const auto &ErrorOrPath = sys::findProgramByName("lld");
if (!ErrorOrPath)
return createStringError(inconvertibleErrorCode(),
"Failed to find `lld` on the PATH.");
std::string LLDPath = ErrorOrPath.get();
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
"Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str());
std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
StringRef Args[] = {LLDPath,
"-flavor",
"gnu",
"--no-undefined",
"-shared",
MCPU,
"-o",
LinkerOutputFilePath.data(),
LinkerInputFilePath.data()};
std::string Error;
int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error);
if (RC)
return Plugin::error("Linking optimized bitcode failed: %s",
Error.c_str());
auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
if (!BufferOrErr)
return Plugin::error("Failed to open temporary file for lld");
if (sys::fs::remove(LinkerOutputFilePath))
return Plugin::error("Failed to remove temporary output file for lld");
if (sys::fs::remove(LinkerInputFilePath))
return Plugin::error("Failed to remove temporary input file for lld");
return std::move(*BufferOrErr);
}
std::string getComputeUnitKind() const override { return ComputeUnitKind; }
uint64_t getClockFrequency() const override { return ClockFrequency; }
Expected<GenericKernelTy &> constructKernel(const char *Name) override {
AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
if (!AMDGPUKernel)
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
new (AMDGPUKernel) AMDGPUKernelTy(Name);
return *AMDGPUKernel;
}
Error setContext() override { return Plugin::success(); }
uint64_t getHardwareParallelism() const override {
return HardwareParallelism;
}
bool shouldSetupRPCServer() const override {
return libomptargetSupportsRPC();
}
uint64_t requestedRPCPortCount() const override {
return getHardwareParallelism();
}
Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
AMDGPUStreamTy *&Stream) {
Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
if (!Stream) {
if (auto Err = AMDGPUStreamManager.getResource(Stream))
return Err;
AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
}
return Plugin::success();
}
Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
int32_t ImageId) override {
AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
if (Error Err = AMDImage->loadExecutable(*this))
return std::move(Err);
return AMDImage;
}
void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
int free(void *TgtPtr, TargetAllocTy Kind) override {
if (TgtPtr == nullptr)
return OFFLOAD_SUCCESS;
AMDGPUMemoryPoolTy *MemoryPool = nullptr;
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
MemoryPool = &HostDevice.getFineGrainedMemoryPool();
break;
case TARGET_ALLOC_SHARED:
MemoryPool = &HostDevice.getFineGrainedMemoryPool();
break;
}
if (!MemoryPool) {
REPORT("No memory pool for the specified allocation kind\n");
return OFFLOAD_FAIL;
}
if (Error Err = MemoryPool->deallocate(TgtPtr)) {
REPORT("%s\n", toString(std::move(Err)).data());
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
AMDGPUStreamTy *Stream =
reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
assert(Stream && "Invalid stream");
if (auto Err = Stream->synchronize())
return Err;
AsyncInfo.Queue = nullptr;
return AMDGPUStreamManager.returnResource(Stream);
}
Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
AMDGPUStreamTy *Stream =
reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
assert(Stream && "Invalid stream");
auto CompletedOrErr = Stream->query();
if (!CompletedOrErr)
return CompletedOrErr.takeError();
if (!(*CompletedOrErr))
return Plugin::success();
AsyncInfo.Queue = nullptr;
return AMDGPUStreamManager.returnResource(Stream);
}
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
void *PinnedPtr = nullptr;
hsa_status_t Status =
hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
return std::move(Err);
return PinnedPtr;
}
Error dataUnlockImpl(void *HstPtr) override {
hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
}
Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
void *&BaseDevAccessiblePtr,
size_t &BaseSize) const override {
hsa_amd_pointer_info_t Info;
Info.size = sizeof(hsa_amd_pointer_info_t);
hsa_status_t Status = hsa_amd_pointer_info(
HstPtr, &Info, nullptr, nullptr,
nullptr);
if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s"))
return std::move(Err);
if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED &&
Info.type != HSA_EXT_POINTER_TYPE_HSA)
return false;
assert(Info.hostBaseAddress && "Invalid host pinned address");
assert(Info.agentBaseAddress && "Invalid agent pinned address");
assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size");
BaseHstPtr = Info.hostBaseAddress;
BaseDevAccessiblePtr = Info.agentBaseAddress;
BaseSize = Info.sizeInBytes;
return true;
}
Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUStreamTy *Stream = nullptr;
void *PinnedPtr = nullptr;
if (void *PinnedPtr =
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
}
if (Size >= OMPX_MaxAsyncCopyBytes) {
if (AsyncInfoWrapper.hasQueue())
if (auto Err = synchronize(AsyncInfoWrapper))
return Err;
hsa_status_t Status;
Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
&PinnedPtr);
if (auto Err =
Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
return Err;
AMDGPUSignalTy Signal;
if (auto Err = Signal.init())
return Err;
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
Agent, PinnedPtr, Agent, Size, 0,
nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
return Err;
if (auto Err = Signal.deinit())
return Err;
Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
}
AMDGPUMemoryManagerTy &PinnedMemoryManager =
HostDevice.getPinnedMemoryManager();
if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
return Err;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
PinnedMemoryManager);
}
Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUStreamTy *Stream = nullptr;
void *PinnedPtr = nullptr;
if (void *PinnedPtr =
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
}
if (Size >= OMPX_MaxAsyncCopyBytes) {
if (AsyncInfoWrapper.hasQueue())
if (auto Err = synchronize(AsyncInfoWrapper))
return Err;
hsa_status_t Status;
Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
&PinnedPtr);
if (auto Err =
Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
return Err;
AMDGPUSignalTy Signal;
if (auto Err = Signal.init())
return Err;
if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
Agent, TgtPtr, Agent, Size, 0, nullptr,
Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
return Err;
if (auto Err = Signal.deinit())
return Err;
Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
}
AMDGPUMemoryManagerTy &PinnedMemoryManager =
HostDevice.getPinnedMemoryManager();
if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
return Err;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
PinnedMemoryManager);
}
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
if (Size >= OMPX_MaxAsyncCopyBytes) {
if (AsyncInfoWrapper.hasQueue())
if (auto Err = synchronize(AsyncInfoWrapper))
return Err;
AMDGPUSignalTy Signal;
if (auto Err = Signal.init())
return Err;
if (auto Err = utils::asyncMemCopy(
useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
return Err;
return Signal.deinit();
}
AMDGPUStreamTy *Stream = nullptr;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
if (Size <= 0)
return Plugin::success();
return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
getAgent(), (uint64_t)Size);
}
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
return Plugin::success();
}
Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
DeviceInfo->Context = nullptr;
if (!DeviceInfo->Device)
DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
return Plugin::success();
}
Error createEventImpl(void **EventPtrStorage) override {
AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
return AMDGPUEventManager.getResource(*Event);
}
Error destroyEventImpl(void *EventPtr) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
return AMDGPUEventManager.returnResource(Event);
}
Error recordEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
assert(Event && "Invalid event");
AMDGPUStreamTy *Stream = nullptr;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Event->record(*Stream);
}
Error waitEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
AMDGPUStreamTy *Stream = nullptr;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
return Event->wait(*Stream);
}
Error syncEventImpl(void *EventPtr) override {
return Plugin::error("Synchronize event not implemented");
}
Error obtainInfoImpl(InfoQueueTy &Info) override {
char TmpChar[1000];
const char *TmpCharPtr = "Unknown";
uint16_t Major, Minor;
uint32_t TmpUInt, TmpUInt2;
uint32_t CacheSize[4];
size_t TmpSt;
bool TmpBool;
uint16_t WorkgrpMaxDim[3];
hsa_dim3_t GridMaxDim;
hsa_status_t Status, Status2;
Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
Info.add("HSA Runtime Version",
std::to_string(Major) + "." + std::to_string(Minor));
Info.add("HSA OpenMP Device Number", DeviceId);
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Product Name", TmpChar);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Device Name", TmpChar);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Vendor Name", TmpChar);
hsa_device_type_t DevType;
Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
if (Status == HSA_STATUS_SUCCESS) {
switch (DevType) {
case HSA_DEVICE_TYPE_CPU:
TmpCharPtr = "CPU";
break;
case HSA_DEVICE_TYPE_GPU:
TmpCharPtr = "GPU";
break;
case HSA_DEVICE_TYPE_DSP:
TmpCharPtr = "DSP";
break;
}
Info.add("Device Type", TmpCharPtr);
}
Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Max Queues", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Queue Min Size", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Queue Max Size", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
if (Status == HSA_STATUS_SUCCESS) {
Info.add("Cache");
for (int I = 0; I < 4; I++)
if (CacheSize[I])
Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
}
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Cacheline Size", TmpUInt);
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Max Clock Freq", TmpUInt, "MHz");
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Compute Units", TmpUInt);
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("SIMD per CU", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Fast F16 Operation", TmpBool);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Wavefront Size", TmpUInt2);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Workgroup Max Size", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
if (Status == HSA_STATUS_SUCCESS) {
Info.add("Workgroup Max Size per Dimension");
Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
}
Status = getDeviceAttrRaw(
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
if (Status == HSA_STATUS_SUCCESS) {
Info.add("Max Waves Per CU", TmpUInt);
Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2);
}
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Grid Max Size", TmpUInt);
Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
if (Status == HSA_STATUS_SUCCESS) {
Info.add("Grid Max Size per Dimension");
Info.add<InfoLevel2>("x", GridMaxDim.x);
Info.add<InfoLevel2>("y", GridMaxDim.y);
Info.add<InfoLevel2>("z", GridMaxDim.z);
}
Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Max fbarriers/Workgrp", TmpUInt);
Info.add("Memory Pools");
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
std::string TmpStr, TmpStr2;
if (Pool->isGlobal())
TmpStr = "Global";
else if (Pool->isReadOnly())
TmpStr = "ReadOnly";
else if (Pool->isPrivate())
TmpStr = "Private";
else if (Pool->isGroup())
TmpStr = "Group";
else
TmpStr = "Unknown";
Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
if (Pool->isGlobal()) {
if (Pool->isFineGrained())
TmpStr2 += "Fine Grained ";
if (Pool->isCoarseGrained())
TmpStr2 += "Coarse Grained ";
if (Pool->supportsKernelArgs())
TmpStr2 += "Kernarg ";
Info.add<InfoLevel3>("Flags", TmpStr2);
}
Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel3>("Size", TmpSt, "bytes");
Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
TmpBool);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel3>("Allocatable", TmpBool);
Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
TmpSt);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
Status = Pool->getAttrRaw(
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
Status =
Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel3>("Accessable by all", TmpBool);
}
Info.add("ISAs");
auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
if (Status == HSA_STATUS_SUCCESS)
Info.add<InfoLevel2>("Name", TmpChar);
return Status;
});
if (Err)
consumeError(std::move(Err));
return Plugin::success();
}
bool useAutoZeroCopyImpl() override {
return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
}
Error getDeviceStackSize(uint64_t &Value) override {
Value = StackSize;
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = DeviceMemoryPoolSize;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
return Err;
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
Error getDeviceMemorySize(uint64_t &Value) override {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
if (Pool->isGlobal()) {
hsa_status_t Status =
Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
return Plugin::check(Status, "Error in getting device memory size: %s");
}
}
return Plugin::error("getDeviceMemorySize:: no global pool");
}
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
hsa_status_t Status =
hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
return Plugin::check(Status, "Error in hsa_agent_get_info: %s");
}
template <typename Ty>
hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value);
}
hsa_agent_t getAgent() const override { return Agent; }
AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
Error retrieveAllMemoryPools() override {
return utils::iterateAgentMemoryPools(
Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
AMDGPUMemoryPoolTy *MemoryPool =
Plugin.allocate<AMDGPUMemoryPoolTy>();
new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
AllMemoryPools.push_back(MemoryPool);
return HSA_STATUS_SUCCESS;
});
}
bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
bool IsCtor) {
const char *KernelName =
IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
return Plugin::success();
AMDGPUKernelTy AMDGPUKernel(KernelName);
if (auto Err = AMDGPUKernel.init(*this, Image))
return Err;
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
if (auto Err =
AMDGPUKernel.launchImpl(*this, 1u,
1ul, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
return Err;
}
Error checkIfAPU() {
llvm::StringRef StrGfxName(ComputeUnitKind);
IsAPU = llvm::StringSwitch<bool>(StrGfxName)
.Case("gfx940", true)
.Default(false);
if (IsAPU)
return Plugin::success();
bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
.Case("gfx942", true)
.Default(false);
if (!MayBeAPU)
return Plugin::success();
uint32_t ChipID = 0;
if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
return Err;
if (!(ChipID & 0x1)) {
IsAPU = true;
return Plugin::success();
}
return Plugin::success();
}
UInt32Envar OMPX_NumQueues;
UInt32Envar OMPX_QueueSize;
UInt32Envar OMPX_DefaultTeamsPerCU;
UInt32Envar OMPX_MaxAsyncCopyBytes;
UInt32Envar OMPX_InitialNumSignals;
UInt32Envar OMPX_StreamBusyWait;
BoolEnvar OMPX_UseMultipleSdmaEngines;
BoolEnvar OMPX_ApuMaps;
AMDGPUStreamManagerTy AMDGPUStreamManager;
AMDGPUEventManagerTy AMDGPUEventManager;
AMDGPUSignalManagerTy AMDGPUSignalManager;
hsa_agent_t Agent;
std::string ComputeUnitKind;
uint64_t ClockFrequency;
uint64_t HardwareParallelism;
AMDHostDeviceTy &HostDevice;
uint64_t DeviceMemoryPoolSize = 1L << 29L ;
uint64_t StackSize = 16 * 1024 ;
bool IsAPU = false;
bool IsXnackEnabled = false;
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
hsa_status_t Status;
Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
if (auto Err =
Plugin::check(Status, "Error in hsa_code_object_deserialize: %s"))
return Err;
Status = hsa_executable_create_alt(
HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
if (auto Err =
Plugin::check(Status, "Error in hsa_executable_create_alt: %s"))
return Err;
Status = hsa_executable_load_code_object(Executable, Device.getAgent(),
CodeObject, "");
if (auto Err =
Plugin::check(Status, "Error in hsa_executable_load_code_object: %s"))
return Err;
Status = hsa_executable_freeze(Executable, "");
if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s"))
return Err;
uint32_t Result;
Status = hsa_executable_validate(Executable, &Result);
if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s"))
return Err;
if (Result)
return Plugin::error("Loaded HSA executable does not validate");
if (auto Err = utils::readAMDGPUMetaDataFromImage(
getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
return Err;
return Plugin::success();
}
Expected<hsa_executable_symbol_t>
AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
StringRef SymbolName) const {
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
hsa_agent_t Agent = AMDGPUDevice.getAgent();
hsa_executable_symbol_t Symbol;
hsa_status_t Status = hsa_executable_get_symbol_by_name(
Executable, SymbolName.data(), &Agent, &Symbol);
if (auto Err = Plugin::check(
Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
SymbolName.data()))
return std::move(Err);
return Symbol;
}
template <typename ResourceTy>
Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
if (Resource)
return Plugin::error("Creating an existing resource");
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
Resource = new ResourceTy(AMDGPUDevice);
return Resource->init();
}
AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
: Agent(Device.getAgent()), Queue(nullptr),
SignalManager(Device.getSignalManager()), Device(Device),
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
DeviceImageTy &Image,
GlobalTy &DeviceGlobal) override {
AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
auto SymbolOrErr =
AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
if (!SymbolOrErr)
return SymbolOrErr.takeError();
hsa_executable_symbol_t Symbol = *SymbolOrErr;
hsa_symbol_kind_t SymbolType;
hsa_status_t Status;
uint64_t SymbolAddr;
uint32_t SymbolSize;
std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
{HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
{HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
for (auto &Info : RequiredInfos) {
Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
if (auto Err = Plugin::check(
Status, "Error in hsa_executable_symbol_get_info: %s"))
return Err;
}
if (SymbolSize != DeviceGlobal.getSize())
return Plugin::error(
"Failed to load global '%s' due to size mismatch (%zu != %zu)",
DeviceGlobal.getName().data(), SymbolSize,
(size_t)DeviceGlobal.getSize());
DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
return Plugin::success();
}
};
struct AMDGPUPluginTy final : public GenericPluginTy {
AMDGPUPluginTy()
: GenericPluginTy(getTripleArch()), Initialized(false),
HostDevice(nullptr) {}
AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
Expected<int32_t> initImpl() override {
hsa_status_t Status = hsa_init();
if (Status != HSA_STATUS_SUCCESS) {
DP("Failed to initialize AMDGPU's HSA library\n");
return 0;
}
Initialized = true;
Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
if (auto Err = Plugin::check(
Status, "Error in hsa_amd_register_system_event_handler: %s"))
return std::move(Err);
llvm::SmallVector<hsa_agent_t> HostAgents;
auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
hsa_device_type_t DeviceType;
hsa_status_t Status =
hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
if (Status != HSA_STATUS_SUCCESS)
return Status;
if (DeviceType == HSA_DEVICE_TYPE_GPU) {
hsa_agent_feature_t Features;
Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
KernelAgents.push_back(Agent);
} else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
HostAgents.push_back(Agent);
}
return HSA_STATUS_SUCCESS;
});
if (Err)
return std::move(Err);
int32_t NumDevices = KernelAgents.size();
if (NumDevices == 0) {
DP("There are no devices supporting AMDGPU.\n");
return 0;
}
if (HostAgents.empty())
return Plugin::error("No AMDGPU host agents");
HostDevice = allocate<AMDHostDeviceTy>();
new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
if (auto Err = HostDevice->init())
return std::move(Err);
return NumDevices;
}
Error deinitImpl() override {
if (!Initialized)
return Plugin::success();
if (HostDevice)
if (auto Err = HostDevice->deinit())
return Err;
hsa_status_t Status = hsa_shut_down();
return Plugin::check(Status, "Error in hsa_shut_down: %s");
}
GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
int32_t NumDevices) override {
return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
getKernelAgent(DeviceId));
}
GenericGlobalHandlerTy *createGlobalHandler() override {
return new AMDGPUGlobalHandlerTy();
}
Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
const char *getName() const override { return GETNAME(TARGET_NAME); }
uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
Expected<bool> isELFCompatible(uint32_t DeviceId,
StringRef Image) const override {
auto ElfOrErr = ELF64LEObjectFile::create(
MemoryBufferRef(Image, ""), false);
if (!ElfOrErr)
return ElfOrErr.takeError();
std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
if (!Processor)
return false;
auto TargeTripleAndFeaturesOrError =
utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
if (!TargeTripleAndFeaturesOrError)
return TargeTripleAndFeaturesOrError.takeError();
return utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
ElfOrErr->getPlatformFlags(),
*TargeTripleAndFeaturesOrError);
}
bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
return true;
}
AMDHostDeviceTy &getHostDevice() {
assert(HostDevice && "Host device not initialized");
return *HostDevice;
}
hsa_agent_t getKernelAgent(int32_t AgentId) const {
assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
return KernelAgents[AgentId];
}
const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
return KernelAgents;
}
private:
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_SUCCESS;
SmallVector<std::string> Reasons;
uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
Reasons.emplace_back("Page not present or supervisor privilege");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
Reasons.emplace_back("Write access to a read-only page");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
Reasons.emplace_back("Execute access to a page marked NX");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
Reasons.emplace_back("GPU attempted access to a host only page");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
Reasons.emplace_back("DRAM ECC failure");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
Reasons.emplace_back("Can't determine the exact fault address");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
Reasons.emplace_back("GPU reset following unspecified hang");
if (Reasons.empty())
Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
uint32_t Node = -1;
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
FATAL_MESSAGE(1,
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
") at virtual address %p. Reasons: %s",
Node, Event->memory_fault.agent.handle,
(void *)Event->memory_fault.virtual_address,
llvm::join(Reasons, ", ").c_str());
return HSA_STATUS_ERROR;
}
bool Initialized;
llvm::SmallVector<hsa_agent_t> KernelAgents;
AMDHostDeviceTy *HostDevice;
};
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
if (ArgsSize != LaunchParams.Size &&
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
return Plugin::error("Mismatch of kernel arguments size");
AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
void *AllArgs = nullptr;
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
ImplArgs = reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
advanceVoidPtr(AllArgs, LaunchParams.Size));
std::memset(ImplArgs, 0, getImplicitArgsSize());
}
if (LaunchParams.Size)
std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
AMDGPUStreamTy *Stream = nullptr;
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
return Err;
if (GenericDevice.getRPCServer())
Stream->setRPCServer(GenericDevice.getRPCServer());
if (ImplArgs &&
getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->BlockCountY = 1;
ImplArgs->BlockCountZ = 1;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
ImplArgs->GridDims = 1;
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
}
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const {
if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
return Plugin::success();
if (!KernelInfo.has_value())
return Plugin::success();
auto NumGroups = NumBlocks;
auto ThreadsPerGroup = NumThreads;
auto ArgNum = KernelArgs.NumArgs;
auto LoopTripCount = KernelArgs.Tripcount;
auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
auto SGPRCount = (*KernelInfo).SGPRCount;
auto VGPRCount = (*KernelInfo).VGPRCount;
auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
"#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
"Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
"%lu\n",
ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
LoopTripCount);
return Plugin::success();
}
template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
return Error::success();
const char *Desc = "Unknown error";
hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
if (Ret != HSA_STATUS_SUCCESS)
REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
ErrFmt, Args..., Desc);
}
void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
TargetAllocTy Kind) {
void *Ptr = nullptr;
if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
consumeError(std::move(Err));
return nullptr;
}
assert(Ptr && "Invalid pointer");
llvm::SmallVector<hsa_agent_t> Agents;
llvm::copy_if(
Plugin.getKernelAgents(), std::back_inserter(Agents),
[&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); });
if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) {
REPORT("%s\n", toString(std::move(Err)).data());
return nullptr;
}
return Ptr;
}
void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
if (Size == 0)
return nullptr;
AMDGPUMemoryPoolTy *MemoryPool = nullptr;
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
MemoryPool = &HostDevice.getFineGrainedMemoryPool();
break;
case TARGET_ALLOC_SHARED:
MemoryPool = &HostDevice.getFineGrainedMemoryPool();
break;
}
if (!MemoryPool) {
REPORT("No memory pool for the specified allocation kind\n");
return nullptr;
}
void *Alloc = nullptr;
if (Error Err = MemoryPool->allocate(Size, &Alloc)) {
REPORT("%s\n", toString(std::move(Err)).data());
return nullptr;
}
if (Alloc) {
llvm::SmallVector<hsa_agent_t> Agents;
llvm::copy_if(static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents(),
std::back_inserter(Agents), [&](hsa_agent_t Agent) {
return MemoryPool->canAccess(Agent);
});
if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) {
REPORT("%s\n", toString(std::move(Err)).data());
return nullptr;
}
}
return Alloc;
}
}
}
}
}
extern "C" {
llvm::omp::target::plugin::GenericPluginTy *createPlugin_amdgpu() {
return new llvm::omp::target::plugin::AMDGPUPluginTy();
}
}