#include "omptarget.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
#include "device.h"
#include "private.h"
#include "rtl.h"
#include "Shared/Profile.h"
#include "OpenMP/Mapping.h"
#include "OpenMP/omp.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Object/ObjectFile.h"
#include <cassert>
#include <cstdint>
#include <vector>
using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif
int AsyncInfoTy::synchronize() {
int Result = OFFLOAD_SUCCESS;
if (!isQueueEmpty()) {
switch (SyncType) {
case SyncTy::BLOCKING:
Result = Device.synchronize(*this);
assert(AsyncInfo.Queue == nullptr &&
"The device plugin should have nulled the queue to indicate there "
"are no outstanding actions!");
break;
case SyncTy::NON_BLOCKING:
Result = Device.queryAsync(*this);
break;
}
}
if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
Result = runPostProcessing();
return Result;
}
void *&AsyncInfoTy::getVoidPtrLocation() {
BufferLocations.push_back(nullptr);
return BufferLocations.back();
}
bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }
int32_t AsyncInfoTy::runPostProcessing() {
size_t Size = PostProcessingFunctions.size();
for (size_t I = 0; I < Size; ++I) {
const int Result = PostProcessingFunctions[I]();
if (Result != OFFLOAD_SUCCESS)
return Result;
}
const auto *PrevBegin = PostProcessingFunctions.begin();
PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);
return OFFLOAD_SUCCESS;
}
bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
* in order to ensure proper alignment of members. E.g.
*
* struct S {
* int a; // 4-aligned
* int b; // 4-aligned
* int *p; // 8-aligned
* } s1;
* ...
* #pragma omp target map(tofrom: s1.b, s1.p[0:N])
* {
* s1.b = 5;
* for (int i...) s1.p[i] = ...;
* }
*
* Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
* BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
* then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
* requirements for its type. Now, when we allocate memory on the device, in
* CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
* This means that the chunk of the struct on the device will start at a
* 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
* address of p will be a misaligned 0x204 (on the host there was no need to add
* padding between b and p, so p comes exactly 4 bytes after b). If the device
* kernel tries to access s1.p, a misaligned address error occurs (as reported
* by the CUDA plugin). By padding the begin address down to a multiple of 8 and
* extending the size of the allocated chuck accordingly, the chuck on the
* device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
* &s1.p=0x208, as they should be to satisfy the alignment requirements.
*/
static const int64_t MaxAlignment = 16;
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
static int initLibrary(DeviceTy &Device) {
* Map global data
*/
int32_t DeviceId = Device.DeviceID;
int Rc = OFFLOAD_SUCCESS;
{
std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
TranslationTable *TransTable =
&PM->HostEntriesBeginToTransTable[HostEntriesBegin];
if (TransTable->HostTable.EntriesBegin ==
TransTable->HostTable.EntriesEnd) {
continue;
}
if (TransTable->TargetsTable[DeviceId] != 0) {
continue;
}
assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
__tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
if (!Img) {
REPORT("No image loaded for device id %d.\n", DeviceId);
Rc = OFFLOAD_FAIL;
break;
}
auto BinaryOrErr = Device.loadBinary(Img);
if (llvm::Error Err = BinaryOrErr.takeError()) {
REPORT("Failed to load image %s\n",
llvm::toString(std::move(Err)).c_str());
Rc = OFFLOAD_FAIL;
break;
}
llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
TransTable->TargetsEntries[DeviceId];
for (__tgt_offload_entry &Entry :
llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
__tgt_device_binary &Binary = *BinaryOrErr;
__tgt_offload_entry DeviceEntry = Entry;
if (Entry.size) {
if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
&DeviceEntry.addr) != OFFLOAD_SUCCESS)
REPORT("Failed to load symbol %s\n", Entry.name);
if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
(PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
Entry.size) != OFFLOAD_SUCCESS)
REPORT("Failed to write symbol for USM %s\n", Entry.name);
}
} else if (Entry.addr) {
if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) !=
OFFLOAD_SUCCESS)
REPORT("Failed to load kernel %s\n", Entry.name);
}
DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name,
DPxPTR(DeviceEntry.addr));
DeviceEntries.emplace_back(DeviceEntry);
}
__tgt_target_table DeviceTable{&DeviceEntries[0],
&DeviceEntries[0] + DeviceEntries.size()};
TransTable->DeviceTables[DeviceId] = DeviceTable;
__tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
&TransTable->DeviceTables[DeviceId];
size_t Hsize =
TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
if (Hsize != Tsize) {
REPORT(
"Host and Target tables mismatch for device id %d [%zx != %zx].\n",
DeviceId, Hsize, Tsize);
TransTable->TargetsImages[DeviceId] = 0;
TransTable->TargetsTable[DeviceId] = 0;
Rc = OFFLOAD_FAIL;
break;
}
MappingInfoTy::HDTTMapAccessorTy HDTTMap =
Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
__tgt_target_table *HostTable = &TransTable->HostTable;
for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
*CurrHostEntry = HostTable->EntriesBegin,
*EntryDeviceEnd = TargetTable->EntriesEnd;
CurrDeviceEntry != EntryDeviceEnd;
CurrDeviceEntry++, CurrHostEntry++) {
if (CurrDeviceEntry->size == 0)
continue;
assert(CurrDeviceEntry->size == CurrHostEntry->size &&
"data size mismatch");
if (Device.getMappingInfo().getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
CurrHostEntry->size))
continue;
void *CurrDeviceEntryAddr = CurrDeviceEntry->addr;
if (CurrDeviceEntry->flags & OMP_DECLARE_TARGET_INDIRECT) {
AsyncInfoTy AsyncInfo(Device);
void *DevPtr;
Device.retrieveData(&DevPtr, CurrDeviceEntryAddr, sizeof(void *),
AsyncInfo, nullptr, &HDTTMap);
if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
CurrDeviceEntryAddr = DevPtr;
}
DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
", name \"%s\"\n",
DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
CurrDeviceEntry->size, CurrDeviceEntry->name);
HDTTMap->emplace(new HostDataToTargetTy(
(uintptr_t)CurrHostEntry->addr ,
(uintptr_t)CurrHostEntry->addr ,
(uintptr_t)CurrHostEntry->addr + CurrHostEntry->size ,
(uintptr_t)CurrDeviceEntryAddr ,
(uintptr_t)CurrDeviceEntryAddr ,
false , CurrHostEntry->name,
true ));
if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
return OFFLOAD_FAIL;
}
}
}
if (Rc != OFFLOAD_SUCCESS)
return Rc;
static Int32Envar DumpOffloadEntries =
Int32Envar("OMPTARGET_DUMP_OFFLOAD_ENTRIES", -1);
if (DumpOffloadEntries.get() == DeviceId)
Device.dumpOffloadEntries();
return OFFLOAD_SUCCESS;
}
void handleTargetOutcome(bool Success, ident_t *Loc) {
switch (OffloadPolicy::get(*PM).Kind) {
case OffloadPolicy::DISABLED:
if (Success) {
FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
}
break;
case OffloadPolicy::MANDATORY:
if (!Success) {
if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
dumpTargetPointerMappings(Loc, Device);
} else
FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
"for debugging options.\n");
if (!PM->getNumActivePlugins()) {
FAILURE_MESSAGE(
"No images found compatible with the installed hardware. ");
llvm::SmallVector<llvm::StringRef> Archs;
for (auto &Image : PM->deviceImages()) {
const char *Start = reinterpret_cast<const char *>(
Image.getExecutableImage().ImageStart);
uint64_t Length = llvm::omp::target::getPtrDiff(
Start, Image.getExecutableImage().ImageEnd);
llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
"");
auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
if (auto Err = ObjectOrErr.takeError()) {
llvm::consumeError(std::move(Err));
continue;
}
if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
Archs.push_back(*CPU);
}
fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
llvm::join(Archs, ",").c_str());
}
SourceInfo Info(Loc);
if (Info.isAvailible())
fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
Info.getColumn());
else
FAILURE_MESSAGE("Source location information not present. Compile with "
"-g or -gline-tables-only.\n");
FATAL_MESSAGE0(
1, "failure of target construct while offloading is mandatory");
} else {
if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
dumpTargetPointerMappings(Loc, Device);
}
}
break;
}
}
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
DP("Offload is disabled\n");
return true;
}
if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
DeviceID = omp_get_default_device();
DP("Use default device id %" PRId64 "\n", DeviceID);
}
if (omp_get_num_devices() == 0) {
DP("omp_get_num_devices() == 0 but offload is manadatory\n");
handleTargetOutcome(false, Loc);
return true;
}
if (DeviceID == omp_get_initial_device()) {
DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
DeviceID);
return true;
}
auto DeviceOrErr = PM->getDevice(DeviceID);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceID, "%s", toString(DeviceOrErr.takeError()).data());
if (initLibrary(*DeviceOrErr) != OFFLOAD_SUCCESS) {
REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
handleTargetOutcome(false, Loc);
return true;
}
return false;
}
static int32_t getParentIndex(int64_t Type) {
return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
DP("Call to %s with non-positive length\n", Name);
return NULL;
}
void *Rc = NULL;
if (DeviceNum == omp_get_initial_device()) {
Rc = malloc(Size);
DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
return Rc;
}
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
return Rc;
}
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
if (!DevicePtr) {
DP("Call to %s with NULL ptr\n", Name);
return;
}
if (DeviceNum == omp_get_initial_device()) {
free(DevicePtr);
DP("%s deallocated host ptr\n", Name);
return;
}
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
DP("omp_target_free deallocated device ptr\n");
}
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
DP("Call to %s with non-positive length\n", Name);
return NULL;
}
void *RC = NULL;
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
int32_t Err = 0;
Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
if (Err) {
DP("Could not lock ptr %p\n", HostPtr);
return nullptr;
}
DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
return RC;
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
DP("%s returns\n", Name);
}
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
void *ArgMapper, AsyncInfoTy &AsyncInfo,
TargetDataFuncPtrTy TargetDataFunction) {
DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
MapperComponentsTy MapperComponents;
MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
(*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
ArgNames);
SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
SmallVector<void *> MapperArgs(MapperComponents.Components.size());
SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
SmallVector<void *> MapperArgNames(MapperComponents.Components.size());
for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
auto &C = MapperComponents.Components[I];
MapperArgsBase[I] = C.Base;
MapperArgs[I] = C.Begin;
MapperArgSizes[I] = C.Size;
MapperArgTypes[I] = C.Type;
MapperArgNames[I] = C.Name;
}
int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
MapperArgsBase.data(), MapperArgs.data(),
MapperArgSizes.data(), MapperArgTypes.data(),
MapperArgNames.data(), nullptr,
AsyncInfo, true);
return Rc;
}
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
for (int32_t I = 0; I < ArgNum; ++I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
DP("Calling targetDataMapper for the %dth argument\n", I);
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataBegin);
if (Rc != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
" failed.\n");
return OFFLOAD_FAIL;
}
continue;
}
void *HstPtrBegin = Args[I];
void *HstPtrBase = ArgsBase[I];
int64_t DataSize = ArgSizes[I];
map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
int64_t TgtPadding = 0;
const int NextI = I + 1;
if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
getParentIndex(ArgTypes[NextI]) == I) {
int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
TgtPadding = (int64_t)HstPtrBegin % Alignment;
if (TgtPadding) {
DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
"\n",
TgtPadding, DPxPTR(HstPtrBegin));
}
}
void *PointerHstPtrBegin, *PointerTgtPtrBegin;
TargetPointerResultTy PointerTpr;
bool IsHostPtr = false;
bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
bool UpdateRef =
!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);
MappingInfoTy::HDTTMapAccessorTy HDTTMap =
Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
DP("Has a pointer entry: \n");
PointerTpr = Device.getMappingInfo().getTargetPointer(
HDTTMap, HstPtrBase, HstPtrBase, 0, sizeof(void *),
nullptr,
false, false, IsImplicit, UpdateRef,
HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
nullptr, false);
PointerTgtPtrBegin = PointerTpr.TargetPointer;
IsHostPtr = PointerTpr.Flags.IsHostPointer;
if (!PointerTgtPtrBegin) {
REPORT("Call to getTargetPointer returned null pointer (%s).\n",
HasPresentModifier ? "'present' map type modifier"
: "device failure or illegal mapping");
return OFFLOAD_FAIL;
}
DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
"\n",
sizeof(void *), DPxPTR(PointerTgtPtrBegin),
(PointerTpr.Flags.IsNewEntry ? "" : " not"));
PointerHstPtrBegin = HstPtrBase;
HstPtrBase = *(void **)HstPtrBase;
UpdateRef =
(!FromMapper || I != 0);
}
const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
auto TPR = Device.getMappingInfo().getTargetPointer(
HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
void *TgtPtrBegin = TPR.TargetPointer;
IsHostPtr = TPR.Flags.IsHostPointer;
if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
REPORT("Call to getTargetPointer returned null pointer (%s).\n",
HasPresentModifier ? "'present' map type modifier"
: "device failure or illegal mapping");
return OFFLOAD_FAIL;
}
DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
" - is%s new\n",
DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
ArgsBase[I] = TgtPtrBase;
}
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
(void **)PointerHstPtrBegin, HstPtrBase,
(void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
TgtPtrBase = ExpectedTgtPtrBase;
int Ret =
Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
AsyncInfo, PointerTpr.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
}
if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
}
bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
!IsStructMember && !IsImplicit && !TPR.isPresent() &&
!TPR.isContained() && !TPR.isHostPointer())
INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
"variable %s does not have a valid device counterpart\n",
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
}
return OFFLOAD_SUCCESS;
}
namespace {
struct PostProcessingInfo {
void *HstPtrBegin;
int64_t DataSize;
int64_t ArgType;
TargetPointerResultTy TPR;
PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
TargetPointerResultTy &&TPR)
: HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
TPR(std::move(TPR)) {}
};
}
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
SmallVector<PostProcessingInfo> &EntriesInfo) {
int Ret = OFFLOAD_SUCCESS;
for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
bool DelEntry = !TPR.isHostPointer();
if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
!(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
DelEntry = false;
}
MappingInfoTy::HDTTMapAccessorTy HDTTMap =
Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
TPR.getEntry()->lock();
auto *Entry = TPR.getEntry();
const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
HDTTMap.destroy();
DelEntry = false;
}
const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
if (HasFrom) {
Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
*ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
DP("Restoring original host pointer value " DPxMOD " for host "
"pointer " DPxMOD "\n",
DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
return OFFLOAD_SUCCESS;
});
}
TPR.setEntry(nullptr);
if (!DelEntry)
continue;
Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
HDTTMap.destroy();
Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Deallocating data from device failed.\n");
break;
}
}
delete &EntriesInfo;
return Ret;
}
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
for (int32_t I = ArgNum - 1; I >= 0; --I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
if (ArgMappers && ArgMappers[I]) {
DP("Calling targetDataMapper for the %dth argument\n", I);
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataEnd);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
" failed.\n");
return OFFLOAD_FAIL;
}
continue;
}
void *HstPtrBegin = Args[I];
int64_t DataSize = ArgSizes[I];
bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
!(FromMapper && I == 0);
bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
ForceDelete, true);
void *TgtPtrBegin = TPR.TargetPointer;
if (!TPR.isPresent() && !TPR.isHostPointer() &&
(DataSize || HasPresentModifier)) {
DP("Mapping does not exist (%s)\n",
(HasPresentModifier ? "'present' map type modifier" : "ignored"));
if (HasPresentModifier) {
MESSAGE("device mapping required by 'present' map type modifier does "
"not exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), DataSize);
return OFFLOAD_FAIL;
}
} else {
DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
" - is%s last\n",
DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
}
if (!TPR.isPresent())
continue;
const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
return OFFLOAD_FAIL;
}
}
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
}
if (TPR.Flags.IsLast) {
if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
}
PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
std::move(TPR));
PostProcessingPtrs->back().TPR.getEntry()->unlock();
}
AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
});
return Ret;
}
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
void *HstPtrBegin, int64_t ArgSize,
int64_t ArgType, AsyncInfoTy &AsyncInfo) {
TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
HstPtrBegin, ArgSize, false,
false, true);
void *TgtPtrBegin = TPR.TargetPointer;
if (!TPR.isPresent()) {
DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
MESSAGE("device mapping required by 'present' motion modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), ArgSize);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
if (TPR.Flags.IsHostPointer) {
DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
DPxPTR(HstPtrBegin));
return OFFLOAD_SUCCESS;
}
if (ArgType & OMP_TGT_MAPTYPE_TO) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
}
if (TPR.getEntry()) {
int Ret = TPR.getEntry()->foreachShadowPointerInfo(
[&](ShadowPtrInfoTy &ShadowPtr) {
DP("Restoring original target pointer value " DPxMOD " for target "
"pointer " DPxMOD "\n",
DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
(void *)&ShadowPtr.TgtPtrVal,
sizeof(void *), AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
});
if (Ret != OFFLOAD_SUCCESS) {
DP("Updating shadow map failed\n");
return Ret;
}
}
}
if (ArgType & OMP_TGT_MAPTYPE_FROM) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
}
if (auto *Entry = TPR.getEntry()) {
AsyncInfo.addPostProcessingFunction([=]() -> int {
int Ret = Entry->foreachShadowPointerInfo(
[&](const ShadowPtrInfoTy &ShadowPtr) {
*ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
DP("Restoring original host pointer value " DPxMOD
" for host pointer " DPxMOD "\n",
DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
return OFFLOAD_SUCCESS;
});
Entry->unlock();
if (Ret != OFFLOAD_SUCCESS) {
DP("Updating shadow map failed\n");
return Ret;
}
return OFFLOAD_SUCCESS;
});
}
}
return OFFLOAD_SUCCESS;
}
static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
void *ArgsBase,
__tgt_target_non_contig *NonContig,
uint64_t Size, int64_t ArgType,
int CurrentDim, int DimSize, uint64_t Offset,
AsyncInfoTy &AsyncInfo) {
int Ret = OFFLOAD_SUCCESS;
if (CurrentDim < DimSize) {
for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
uint64_t CurOffset =
(NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
if (CurrentDim != DimSize - 1 || I == 0) {
Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
ArgType, CurrentDim + 1, DimSize,
Offset + CurOffset, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS)
return Ret;
}
}
} else {
char *Ptr = (char *)ArgsBase + Offset;
DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
" len %" PRIu64 "\n",
DPxPTR(Ptr), Offset, Size);
Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
AsyncInfo);
}
return Ret;
}
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
int32_t DimSize) {
int RemovedDim = 0;
for (int I = DimSize - 1; I > 0; --I) {
if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
RemovedDim++;
}
return RemovedDim;
}
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
for (int32_t I = 0; I < ArgNum; ++I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
if (ArgMappers && ArgMappers[I]) {
DP("Calling targetDataMapper for the %dth argument\n", I);
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataUpdate);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
" failed.\n");
return OFFLOAD_FAIL;
}
continue;
}
int Ret = OFFLOAD_SUCCESS;
if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
__tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
int32_t DimSize = ArgSizes[I];
uint64_t Size =
NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
Ret = targetDataNonContiguous(
Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
0, DimSize - MergedDim, 0, AsyncInfo);
} else {
Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I], AsyncInfo);
}
if (Ret == OFFLOAD_FAIL)
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
OMP_TGT_MAPTYPE_LITERAL |
OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
return (Mapping & LambdaMapping) == LambdaMapping;
}
namespace {
TableMap *getTableMap(void *HostPtr) {
std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
HostPtrToTableMapTy::iterator TableMapIt =
PM->HostPtrToTableMap.find(HostPtr);
if (TableMapIt != PM->HostPtrToTableMap.end())
return &TableMapIt->second;
TableMap *TM = nullptr;
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
for (HostEntriesBeginToTransTableTy::iterator Itr =
PM->HostEntriesBeginToTransTable.begin();
Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
TranslationTable *TransTable = &Itr->second;
__tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
if (Cur->addr != HostPtr)
continue;
TM = &(PM->HostPtrToTableMap)[HostPtr];
TM->Table = TransTable;
TM->Index = I;
return TM;
}
}
return nullptr;
}
class PrivateArgumentManagerTy {
struct FirstPrivateArgInfoTy {
char *HstPtrBegin;
char *HstPtrEnd;
int Index;
uint32_t Alignment;
uint32_t Size;
uint32_t Padding;
map_var_info_t HstPtrName = nullptr;
FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
uint32_t Alignment, uint32_t Padding,
map_var_info_t HstPtrName = nullptr)
: HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
};
SmallVector<void *> TgtPtrs;
SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
SmallVector<char> FirstPrivateArgBuffer;
int64_t FirstPrivateArgSize = 0;
DeviceTy &Device;
AsyncInfoTy &AsyncInfo;
static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
public:
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
: Device(Dev), AsyncInfo(AsyncInfo) {}
int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
map_var_info_t HstPtrName = nullptr,
const bool AllocImmediately = false) {
if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
AllocImmediately) {
TgtPtr = Device.allocData(ArgSize, HstPtr);
if (!TgtPtr) {
DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
(IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
return OFFLOAD_FAIL;
}
#ifdef OMPTARGET_DEBUG
void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
" for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
"\n",
ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
if (IsFirstPrivate) {
DP("Submitting firstprivate data to the device.\n");
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Copying data to device failed, failed.\n");
return OFFLOAD_FAIL;
}
}
TgtPtrs.push_back(TgtPtr);
} else {
DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
DPxPTR(HstPtr), ArgSize);
TgtPtr = nullptr;
auto *LastFPArgInfo =
FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();
uint32_t Padding = 0;
uint32_t StartAlignment =
LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
if (LastFPArgInfo) {
uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
if (Offset)
StartAlignment = Offset;
uint32_t RequiredAlignment =
llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
if (RequiredAlignment > StartAlignment) {
Padding = RequiredAlignment - StartAlignment;
StartAlignment = RequiredAlignment;
}
}
FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
StartAlignment, Padding, HstPtrName);
FirstPrivateArgSize += Padding + ArgSize;
}
return OFFLOAD_SUCCESS;
}
int packAndTransfer(SmallVector<void *> &TgtArgs) {
if (!FirstPrivateArgInfo.empty()) {
assert(FirstPrivateArgSize != 0 &&
"FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
auto *Itr = FirstPrivateArgBuffer.begin();
for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
Itr = std::next(Itr, Info.Padding);
std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
Itr = std::next(Itr, Info.Size);
}
void *TgtPtr =
Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
if (TgtPtr == nullptr) {
DP("Failed to allocate target memory for private arguments.\n");
return OFFLOAD_FAIL;
}
TgtPtrs.push_back(TgtPtr);
DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
FirstPrivateArgSize, DPxPTR(TgtPtr));
int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
FirstPrivateArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Failed to submit data of private arguments.\n");
return OFFLOAD_FAIL;
}
auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
void *&Ptr = TgtArgs[Info.Index];
assert(Ptr == nullptr && "Target pointer is already set by mistaken");
TP += Info.Padding;
Ptr = reinterpret_cast<void *>(TP);
TP += Info.Size;
DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
"\n",
DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
DPxPTR(Ptr));
}
}
return OFFLOAD_SUCCESS;
}
int free() {
for (void *P : TgtPtrs) {
int Ret = Device.deleteData(P);
if (Ret != OFFLOAD_SUCCESS) {
DP("Deallocation of (first-)private arrays failed.\n");
return OFFLOAD_FAIL;
}
}
TgtPtrs.clear();
return OFFLOAD_SUCCESS;
}
};
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
int32_t ArgNum, void **ArgBases, void **Args,
int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers,
SmallVector<void *> &TgtArgs,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin failed, abort target.\n");
return OFFLOAD_FAIL;
}
SmallVector<int> TgtArgsPositions(ArgNum, -1);
for (int32_t I = 0; I < ArgNum; ++I) {
if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
if (isLambdaMapping(ArgTypes[I])) {
assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
"PTR_AND_OBJ must be also MEMBER_OF.");
unsigned Idx = getParentIndex(ArgTypes[I]);
int TgtIdx = TgtArgsPositions[Idx];
assert(TgtIdx != -1 && "Base address must be translated already.");
void *HstPtrVal = Args[I];
void *HstPtrBegin = ArgBases[I];
void *HstPtrBase = Args[Idx];
void *TgtPtrBase =
(void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
TargetPointerResultTy TPR =
DeviceOrErr->getMappingInfo().getTgtPtrBegin(
HstPtrVal, ArgSizes[I], false,
false);
PointerTgtPtrBegin = TPR.TargetPointer;
if (!TPR.isPresent()) {
DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
DPxPTR(HstPtrVal));
continue;
}
if (TPR.Flags.IsHostPointer) {
DP("Unified memory is active, no need to map lambda captured"
"variable (" DPxMOD ")\n",
DPxPTR(HstPtrVal));
continue;
}
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret =
DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
sizeof(void *), AsyncInfo, TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
}
}
continue;
}
void *HstPtrBegin = Args[I];
void *HstPtrBase = ArgBases[I];
void *TgtPtrBegin;
map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
ptrdiff_t TgtBaseOffset;
TargetPointerResultTy TPR;
if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
DP("Forwarding first-private value " DPxMOD " to the target construct\n",
DPxPTR(HstPtrBase));
TgtPtrBegin = HstPtrBase;
TgtBaseOffset = 0;
} else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
const bool AllocImmediately =
(I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
Ret = PrivateArgumentManager.addArg(
HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
TgtArgs.size(), HstPtrName, AllocImmediately);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Failed to process %sprivate argument " DPxMOD "\n",
(IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
return OFFLOAD_FAIL;
}
} else {
if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
HstPtrBegin, ArgSizes[I],
false,
false);
TgtPtrBegin = TPR.TargetPointer;
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
}
TgtArgsPositions[I] = TgtArgs.size();
TgtArgs.push_back(TgtPtrBegin);
TgtOffsets.push_back(TgtBaseOffset);
}
assert(TgtArgs.size() == TgtOffsets.size() &&
"Size mismatch in arguments and offsets");
Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
if (Ret != OFFLOAD_SUCCESS) {
DP("Failed to pack and transfer first private arguments\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
int32_t ArgNum, void **ArgBases, void **Args,
int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataEnd failed, abort target.\n");
return OFFLOAD_FAIL;
}
AsyncInfo.addPostProcessingFunction(
[PrivateArgumentManager =
std::move(PrivateArgumentManager)]() mutable -> int {
int Ret = PrivateArgumentManager.free();
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Failed to deallocate target memory for private args\n");
return OFFLOAD_FAIL;
}
return Ret;
});
return OFFLOAD_SUCCESS;
}
}
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
int32_t DeviceId = Device.DeviceID;
TableMap *TM = getTableMap(HostPtr);
if (!TM) {
REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
DPxPTR(HostPtr));
return OFFLOAD_FAIL;
}
__tgt_target_table *TargetTable = nullptr;
{
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
TargetTable = TM->Table->TargetsTable[DeviceId];
}
assert(TargetTable && "Global data has not been mapped\n");
DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);
SmallVector<void *> TgtArgs;
SmallVector<ptrdiff_t> TgtOffsets;
PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
int NumClangLaunchArgs = KernelArgs.NumArgs;
int Ret = OFFLOAD_SUCCESS;
if (NumClangLaunchArgs) {
Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
KernelArgs.ArgSizes, KernelArgs.ArgTypes,
KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
TgtOffsets, PrivateArgumentManager, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Failed to process data before launching the kernel.\n");
return OFFLOAD_FAIL;
}
KernelArgs.NumArgs = TgtArgs.size();
}
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"Kernel Target",
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
";TripCount=" + std::to_string(KernelArgs.Tripcount),
Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
int32_t NumTeams = KernelArgs.NumTeams[0];
InterfaceRAII TargetSubmitRAII(
RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif
Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
KernelArgs, AsyncInfo);
}
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Executing target region abort target.\n");
return OFFLOAD_FAIL;
}
if (NumClangLaunchArgs) {
Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
KernelArgs.ArgSizes, KernelArgs.ArgTypes,
KernelArgs.ArgNames, KernelArgs.ArgMappers,
PrivateArgumentManager, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Failed to process data after launching the kernel.\n");
return OFFLOAD_FAIL;
}
}
return OFFLOAD_SUCCESS;
}
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
VAddr, IsRecord, SaveOutput,
ReqPtrArgOffset);
}
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
int32_t ThreadLimit, uint64_t LoopTripCount,
AsyncInfoTy &AsyncInfo) {
int32_t DeviceId = Device.DeviceID;
TableMap *TM = getTableMap(HostPtr);
if (!TM) {
REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
DPxPTR(HostPtr));
return OFFLOAD_FAIL;
}
__tgt_target_table *TargetTable = nullptr;
{
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
TargetTable = TM->Table->TargetsTable[DeviceId];
}
assert(TargetTable && "Global data has not been mapped\n");
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
void *TgtPtr = Device.allocData(DeviceMemorySize, nullptr,
TARGET_ALLOC_DEFAULT);
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
KernelArgsTy KernelArgs{};
KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
KernelArgs.NumArgs = NumArgs;
KernelArgs.Tripcount = LoopTripCount;
KernelArgs.NumTeams[0] = NumTeams;
KernelArgs.ThreadLimit[0] = ThreadLimit;
int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Executing target region abort target.\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}