#include "GPUOpsLowering.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"
using namespace mlir;
/// Lowers a `gpu.func` into an `llvm.func` carrying the same name.
///
/// Steps, in order:
///  1. Emit one internal-linkage global array per workgroup attribution.
///  2. Convert the function type and prepare the entry-block signature
///     conversion.
///  3. Create the `llvm.func`, forwarding every attribute except the symbol
///     name, the function type and the workgroup-attribution count.
///  4. Inside the original body, build memref descriptors for the workgroup
///     globals and for freshly alloca'ed private buffers, and remap the
///     attribution block arguments to those descriptors.
///  5. Move the body into the new function, convert its region types and
///     erase the original op.
///
/// \returns failure if the region type conversion fails, success otherwise.
LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                   ConversionPatternRewriter &rewriter) const {
  Location loc = gpuFuncOp.getLoc();

  // Step 1: one global buffer per workgroup attribution, created at the
  // rewriter's current insertion point.
  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
  for (const auto &indexed :
       llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
    auto memrefType = indexed.value().getType().dyn_cast<MemRefType>();
    assert(memrefType && memrefType.hasStaticShape() &&
           "unexpected type in attribution");

    uint64_t elementCount = memrefType.getNumElements();
    auto llvmElementType =
        typeConverter->convertType(memrefType.getElementType()).cast<Type>();
    auto bufferType = LLVM::LLVMArrayType::get(llvmElementType, elementCount);

    // Globals are named "__wg_<function name>_<attribution index>".
    std::string bufferName =
        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), indexed.index())
            .str();
    workgroupBuffers.push_back(rewriter.create<LLVM::GlobalOp>(
        loc, bufferType, /*isConstant=*/false, LLVM::Linkage::Internal,
        bufferName, /*value=*/Attribute(), /*alignment=*/0,
        gpu::GPUDialect::getWorkgroupAddressSpace()));
  }

  // Step 2: the converted function type comes back wrapped in a pointer type;
  // unwrap it to obtain the type used for the new function.
  auto llvmFuncType = typeConverter->convertType(gpuFuncOp.getFunctionType())
                          .cast<LLVM::LLVMPointerType>()
                          .getElementType();

  // The conversion is sized by the entry block, which holds the attribution
  // arguments in addition to the arguments of the function type.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());
  getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getFunctionType(), /*isVariadic=*/false, signatureConversion);

  // Step 3: collect the attributes to forward to the new function.
  SmallVector<NamedAttribute, 4> attributes;
  for (const auto &attr : gpuFuncOp->getAttrs()) {
    const auto attrName = attr.getName();
    const bool isDropped =
        attrName == SymbolTable::getSymbolAttrName() ||
        attrName == FunctionOpInterface::getTypeAttrName() ||
        attrName == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName();
    if (!isDropped)
      attributes.push_back(attr);
  }
  // Kernels additionally receive a unit attribute named `kernelAttributeName`.
  if (gpuFuncOp.isKernel())
    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());

  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
      loc, gpuFuncOp.getName(), llvmFuncType, LLVM::Linkage::External,
      /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C, attributes);

  {
    // Step 4: everything below is inserted at the start of the *original*
    // function body, i.e. before the region is moved to the new function.
    // The guard restores the insertion point when this scope ends.
    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());

    // Attribution block arguments are indexed after the function's own
    // arguments: workgroup attributions first, private attributions next.
    const unsigned firstWorkgroupArg = gpuFuncOp.getNumArguments();
    auto i32Type = IntegerType::get(rewriter.getContext(), 32);

    // The zero GEP index is only materialized when a buffer needs it.
    Value zero = nullptr;
    if (!workgroupBuffers.empty())
      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                               rewriter.getI32IntegerAttr(0));

    for (unsigned idx = 0, numBuffers = workgroupBuffers.size();
         idx != numBuffers; ++idx) {
      LLVM::GlobalOp buffer = workgroupBuffers[idx];
      Value bufferAddress = rewriter.create<LLVM::AddressOfOp>(loc, buffer);
      auto bufferElementType =
          buffer.getType().cast<LLVM::LLVMArrayType>().getElementType();
      // GEP with indices {0, 0} yields a pointer to the array's first element.
      Value firstElement = rewriter.create<LLVM::GEPOp>(
          loc,
          LLVM::LLVMPointerType::get(bufferElementType, buffer.getAddrSpace()),
          bufferAddress, ArrayRef<Value>{zero, zero});

      auto memrefType = gpuFuncOp.getWorkgroupAttributions()[idx]
                            .getType()
                            .cast<MemRefType>();
      auto descriptor = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), memrefType, firstElement);
      signatureConversion.remapInput(firstWorkgroupArg + idx, descriptor);
    }

    const unsigned firstPrivateArg =
        firstWorkgroupArg + gpuFuncOp.getNumWorkgroupAttributions();
    auto i64Type = IntegerType::get(rewriter.getContext(), 64);
    for (const auto &indexed :
         llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      auto memrefType = indexed.value().getType().cast<MemRefType>();
      assert(memrefType && memrefType.hasStaticShape() &&
             "unexpected type in attribution");

      // The pointer type uses `allocaAddrSpace` rather than any memory space
      // carried by the memref type.
      auto allocaPtrType = LLVM::LLVMPointerType::get(
          typeConverter->convertType(memrefType.getElementType()).cast<Type>(),
          allocaAddrSpace);
      Value elementCount = rewriter.create<LLVM::ConstantOp>(
          loc, i64Type, rewriter.getI64IntegerAttr(memrefType.getNumElements()));
      Value privateBuffer = rewriter.create<LLVM::AllocaOp>(
          loc, allocaPtrType, elementCount, /*alignment=*/0);
      auto descriptor = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), memrefType, privateBuffer);
      signatureConversion.remapInput(firstPrivateArg + indexed.index(),
                                     descriptor);
    }
  }

  // Step 5: move the body over and apply the signature conversion to it.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  rewriter.eraseOp(gpuFuncOp);
  return success();
}
// Common prefix of the symbol names given to the global constants that hold
// printf format strings; the lowerings in this file append a counter to it
// until the resulting name is unused in the enclosing GPU module.
static constexpr char formatStringPrefix[] = "printfFormat_";
/// Returns the `llvm.func` called `name` found by symbol lookup in `moduleOp`.
/// If the lookup yields nothing, an external-linkage function of the given
/// `type` is created at the start of the module body and returned instead.
/// The rewriter's insertion point is restored before returning.
///
/// \param moduleOp op providing `lookupSymbol` and `getBody`.
/// \param loc      location attached to a newly created function.
/// \param rewriter rewriter used to create the function.
/// \param name     symbol name to look up / define.
/// \param type     LLVM function type used when the function is created.
template <typename T>
static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc,
                                            ConversionPatternRewriter &rewriter,
                                            StringRef name,
                                            LLVM::LLVMFunctionType type) {
  if (auto existing = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))
    return existing;

  // Scope-bound guard: the insertion point is reset when the function exits,
  // after the new op has been created.
  ConversionPatternRewriter::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointToStart(moduleOp.getBody());
  return rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
                                           LLVM::Linkage::External);
}
/// Lowers `gpu.printf` to a sequence of calls to `__ockl_printf_begin`,
/// `__ockl_printf_append_string_n` and, when the op has arguments,
/// `__ockl_printf_append_args`.
///
/// The format string is stored NUL-terminated in a new internal constant
/// global inside the enclosing `gpu.module`. Arguments are forwarded in groups
/// of up to seven i64 values per append call: floats are extended to f64 and
/// bitcast to i64, and values whose bit width is not 64 are zero-extended.
///
/// \returns a match failure (without modifying the IR) if the op is not nested
/// in a `gpu.module`; success otherwise.
LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = gpuPrintfOp->getLoc();

  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
  mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8);
  mlir::Type llvmIndex = typeConverter->convertType(rewriter.getIndexType());
  mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
  mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());

  // Declarations and the format-string global are placed in the enclosing
  // gpu.module. Without one the lookups below would dereference a null op, so
  // bail out before any IR has been created.
  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
  if (!moduleOp)
    return rewriter.notifyMatchFailure(
        gpuPrintfOp, "expected gpu.printf to be nested in a gpu.module");

  auto args = adaptor.args();

  auto ocklBegin =
      getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
                          LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
  // The argument-appending function is only declared when it will be called.
  LLVM::LLVMFuncOp ocklAppendArgs;
  if (!args.empty()) {
    ocklAppendArgs = getOrDefineFunction(
        moduleOp, loc, rewriter, "__ockl_printf_append_args",
        LLVM::LLVMFunctionType::get(
            llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
                      llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
  }
  auto ocklAppendStringN = getOrDefineFunction(
      moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
      LLVM::LLVMFunctionType::get(
          llvmI64,
          {llvmI64, i8Ptr, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));

  // Begin the printf sequence; every call returns the descriptor that is
  // threaded into the next one.
  Value zeroI64 = rewriter.create<LLVM::ConstantOp>(
      loc, llvmI64, rewriter.getI64IntegerAttr(0));
  auto printfBeginCall = rewriter.create<LLVM::CallOp>(loc, ocklBegin, zeroI64);
  Value printfDesc = printfBeginCall.getResult(0);

  // Pick the first "printfFormat_<N>" symbol name not yet used in the module.
  unsigned stringNumber = 0;
  SmallString<16> stringConstName;
  do {
    stringConstName.clear();
    (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
  } while (moduleOp.lookupSymbol(stringConstName));

  llvm::SmallString<20> formatString(adaptor.format());
  formatString.push_back('\0'); // NUL terminator is part of the stored string.
  size_t formatStringSize = formatString.size_in_bytes();

  auto globalType = LLVM::LLVMArrayType::get(llvmI8, formatStringSize);
  LLVM::GlobalOp global;
  {
    ConversionPatternRewriter::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(moduleOp.getBody());
    global = rewriter.create<LLVM::GlobalOp>(
        loc, globalType,
        /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
        rewriter.getStringAttr(formatString));
  }

  // Pointer to the first character of the format string.
  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
  Value zero = rewriter.create<LLVM::ConstantOp>(
      loc, llvmIndex, rewriter.getIntegerAttr(llvmIndex, 0));
  Value stringStart = rewriter.create<LLVM::GEPOp>(
      loc, i8Ptr, globalPtr, mlir::ValueRange({zero, zero}));
  // The length passed along includes the NUL terminator appended above.
  Value stringLen = rewriter.create<LLVM::ConstantOp>(
      loc, llvmI64, rewriter.getI64IntegerAttr(formatStringSize));

  Value oneI32 = rewriter.create<LLVM::ConstantOp>(
      loc, llvmI32, rewriter.getI32IntegerAttr(1));
  Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
      loc, llvmI32, rewriter.getI32IntegerAttr(0));

  // The trailing flag is 1 when no argument-append calls will follow.
  auto appendFormatCall = rewriter.create<LLVM::CallOp>(
      loc, ocklAppendStringN,
      ValueRange{printfDesc, stringStart, stringLen,
                 args.empty() ? oneI32 : zeroI32});
  printfDesc = appendFormatCall.getResult(0);

  // Each append call carries exactly seven value slots.
  constexpr size_t argsPerAppend = 7;
  size_t nArgs = args.size();
  for (size_t group = 0; group < nArgs; group += argsPerAppend) {
    size_t bound = std::min(group + argsPerAppend, nArgs);
    size_t numArgsThisCall = bound - group;

    SmallVector<mlir::Value, 2 + argsPerAppend + 1> arguments;
    arguments.push_back(printfDesc);
    arguments.push_back(rewriter.create<LLVM::ConstantOp>(
        loc, llvmI32, rewriter.getI32IntegerAttr(numArgsThisCall)));
    for (size_t i = group; i < bound; ++i) {
      Value arg = args[i];
      // Floats travel as the bit pattern of their f64 value.
      if (auto floatType = arg.getType().dyn_cast<FloatType>()) {
        if (!floatType.isF64())
          arg = rewriter.create<LLVM::FPExtOp>(
              loc, typeConverter->convertType(rewriter.getF64Type()), arg);
        arg = rewriter.create<LLVM::BitcastOp>(loc, llvmI64, arg);
      }
      if (arg.getType().getIntOrFloatBitWidth() != 64)
        arg = rewriter.create<LLVM::ZExtOp>(loc, llvmI64, arg);

      arguments.push_back(arg);
    }
    // Fill the unused slots of a partial group with zeros.
    for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
      arguments.push_back(zeroI64);
    }

    // Flag the call that carries the final group of arguments.
    auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
    arguments.push_back(isLast);
    auto call = rewriter.create<LLVM::CallOp>(loc, ocklAppendArgs, arguments);
    printfDesc = call.getResult(0);
  }
  rewriter.eraseOp(gpuPrintfOp);
  return success();
}
/// Lowers `gpu.printf` to a call to a variadic `printf(i8*, ...) -> i32`
/// function declared in the enclosing `gpu.module`.
///
/// The format string is stored NUL-terminated in a new internal constant
/// global placed in `addressSpace`; a pointer to its first character is passed
/// as the first call operand, followed by the op's arguments unchanged.
///
/// \returns a match failure (without modifying the IR) if the op is not nested
/// in a `gpu.module`; success otherwise.
LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = gpuPrintfOp->getLoc();

  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
  mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8, addressSpace);
  mlir::Type llvmIndex = typeConverter->convertType(rewriter.getIndexType());

  // The declaration and the format-string global are placed in the enclosing
  // gpu.module. Without one the lookups below would dereference a null op, so
  // bail out before any IR has been created.
  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
  if (!moduleOp)
    return rewriter.notifyMatchFailure(
        gpuPrintfOp, "expected gpu.printf to be nested in a gpu.module");

  auto printfType = LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {i8Ptr},
                                                /*isVarArg=*/true);
  LLVM::LLVMFuncOp printfDecl =
      getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);

  // Pick the first "printfFormat_<N>" symbol name not yet used in the module.
  unsigned stringNumber = 0;
  SmallString<16> stringConstName;
  do {
    stringConstName.clear();
    (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
  } while (moduleOp.lookupSymbol(stringConstName));

  llvm::SmallString<20> formatString(adaptor.format());
  formatString.push_back('\0'); // NUL terminator is part of the stored string.

  auto globalType =
      LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes());
  LLVM::GlobalOp global;
  {
    ConversionPatternRewriter::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(moduleOp.getBody());
    global = rewriter.create<LLVM::GlobalOp>(
        loc, globalType,
        /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
        rewriter.getStringAttr(formatString), /*alignment=*/0, addressSpace);
  }

  // Pointer to the first character of the format string.
  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
  Value zero = rewriter.create<LLVM::ConstantOp>(
      loc, llvmIndex, rewriter.getIntegerAttr(llvmIndex, 0));
  Value stringStart = rewriter.create<LLVM::GEPOp>(
      loc, i8Ptr, globalPtr, mlir::ValueRange({zero, zero}));

  // Call operands: the format string pointer, then the op's own arguments.
  auto argsRange = adaptor.args();
  SmallVector<Value, 4> printfArgs;
  printfArgs.reserve(argsRange.size() + 1);
  printfArgs.push_back(stringStart);
  printfArgs.append(argsRange.begin(), argsRange.end());

  rewriter.create<LLVM::CallOp>(loc, printfDecl, printfArgs);
  rewriter.eraseOp(gpuPrintfOp);
  return success();
}