#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"
#include <optional>
#define DEBUG_TYPE "loops-to-gpu"
using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;
/// Name of the unit attribute that ParallelToGpuLaunchLowering::matchAndRewrite
/// attaches to every scf.parallel it is invoked on. The legality callback in
/// configureParallelLoopToGPULegality treats ops carrying it as legal, and
/// finalizeParallelLoopToGPUConversion removes it again.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";
/// Selects one component of `dim3` by position: 0 yields `x`, 1 yields `y`
/// and 2 yields `z`. Any other position is a programming error.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  if (pos == 0)
    return dim3.x;
  if (pos == 1)
    return dim3.y;
  if (pos == 2)
    return dim3.z;
  llvm_unreachable("dim3 position out of bounds");
}
/// Thin forwarding helper: returns whatever
/// `AffineForOp::getLowerBoundOperands` reports for `forOp`.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  Operation::operand_range lbOperands = forOp.getLowerBoundOperands();
  return lbOperands;
}
/// Thin forwarding helper: returns whatever
/// `AffineForOp::getUpperBoundOperands` reports for `forOp`.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  Operation::operand_range ubOperands = forOp.getUpperBoundOperands();
  return ubOperands;
}
/// Materializes the integer step of `forOp` as an `arith.constant` of index
/// type at the current insertion point of `builder`, and returns that value.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  Location loopLoc = forOp.getLoc();
  Value stepConstant =
      builder.create<arith::ConstantIndexOp>(loopLoc, forOp.getStepAsInt());
  return stepConstant;
}
/// Produces the lower bound of `forOp` as an SSA value by delegating to
/// `lowerAffineLowerBound`, which emits any needed ops through `builder`.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  Value loweredBound = lowerAffineLowerBound(forOp, builder);
  return loweredBound;
}
/// Produces the upper bound of `forOp` as an SSA value by delegating to
/// `lowerAffineUpperBound`, which emits any needed ops through `builder`.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  Value loweredBound = lowerAffineUpperBound(forOp, builder);
  return loweredBound;
}
/// Verifies that the `numDims` outermost loops of the nest rooted at `forOp`
/// can be mapped:
///   * the bound operands of every inspected loop must be defined above the
///     region of the outermost loop, and
///   * every inspected loop except the last must contain exactly two
///     operations, the first of which is another `affine.for`.
/// The body of the last inspected loop is not constrained. Emits an error on
/// the offending operation and returns failure if a check does not hold.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  // All bounds are checked against the region of the *outermost* loop, so
  // capture it before descending.
  Region &outermostRegion = forOp.getRegion();
  AffineForOp loop = forOp;
  for (unsigned depth = 0; depth < numDims; ++depth) {
    const bool boundsDefinedAbove =
        areValuesDefinedAbove(getLowerBoundOperands(loop), outermostRegion) &&
        areValuesDefinedAbove(getUpperBoundOperands(loop), outermostRegion);
    if (!boundsDefinedAbove)
      return loop.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // No nesting requirement is imposed on the last inspected loop.
    if (depth + 1 == numDims)
      return success();

    // Perfect nesting: the body holds exactly two operations.
    Block *body = loop.getBody();
    if (body->empty() || std::next(body->begin(), 2) != body->end())
      return loop.emitError("expected perfectly nested loops in the body");

    Operation *child = &body->front();
    loop = dyn_cast<AffineForOp>(child);
    if (!loop)
      return child->emitError("expected a nested loop");
  }
  return success();
}
/// Validates the requested block/thread dimension counts and, when they are
/// usable, checks the `numBlockDims + numThreadDims` outermost loops of the
/// nest rooted at `forOp` via checkAffineLoopNestMappableImpl.
///
/// Returns success without inspecting the nest when either count is zero.
/// Emits an error on `forOp` when either count exceeds 3.
static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  constexpr unsigned kMaxDimsPerKind = 3;

  const bool nothingToMap = numBlockDims == 0 || numThreadDims == 0;
  if (nothingToMap) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > kMaxDimsPerKind)
    return forOp.emitError("cannot map to more than 3 block dimensions");
  if (numThreadDims > kMaxDimsPerKind)
    return forOp.emitError("cannot map to more than 3 thread dimensions");

  const unsigned numMappedLoops = numBlockDims + numThreadDims;
  return checkAffineLoopNestMappableImpl(forOp, numMappedLoops);
}
namespace {
/// Helper that converts a nest of affine.for loops into a gpu.launch in two
/// steps: `collectBounds` gathers per-loop information into the vectors
/// below, and `createLaunch` consumes it to build the launch operation.
struct AffineLoopToGpuConverter {
/// Records bounds, steps and induction variables for `numLoops` loops
/// starting at `forOp`. Returns the last loop visited, or std::nullopt if a
/// bound could not be produced.
std::optional<AffineForOp> collectBounds(AffineForOp forOp,
unsigned numLoops);
/// Builds a gpu.launch before `rootForOp`, moves the body of
/// `innermostForOp` into it, and erases `rootForOp`.
void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
unsigned numBlockDims, unsigned numThreadDims);
/// Per-loop iteration ranges; used as grid/block sizes of the launch.
SmallVector<Value, 6> dims;
/// Per-loop lower bound values.
SmallVector<Value, 6> lbs;
/// Per-loop induction variables.
SmallVector<Value, 6> ivs;
/// Per-loop step values.
SmallVector<Value, 6> steps;
};
}
/// Collects lower bound, iteration range, step and induction variable for
/// `numLoops` loops, starting at `forOp` and descending through the first
/// operation of each loop body. The results are appended to `lbs`, `dims`,
/// `steps` and `ivs` in outermost-to-innermost order.
///
/// All emitted operations are inserted before `forOp`. The range of a loop is
/// `upperBound - lowerBound`, divided (rounding up) by the step when the step
/// is not the constant 1.
///
/// Returns the innermost loop visited, or std::nullopt if a lower or upper
/// bound could not be materialized. The caller must make sure that the first
/// operation in the body of every visited loop but the last is an
/// `affine.for`; this is enforced with `cast`.
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    // Only emit the division when the step is not known to be 1.
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range =
          builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    // Descend into the nested loop, which is the first op of the body.
    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
/// Replaces the loop nest rooted at `rootForOp` by a gpu.launch operation.
///
/// The first `numBlockDims` entries of `dims` become the grid sizes and the
/// following `numThreadDims` entries become the block sizes; dimensions
/// without a mapped loop get the constant size 1. The operations of
/// `innermostForOp`'s body are moved into the launch body, its terminator is
/// replaced by a gpu.terminator, and every collected induction variable is
/// replaced by `lowerBound + step * id`, where `id` is the matching block or
/// thread id (the multiplication is omitted when the step is the constant 1).
/// Finally `rootForOp` is erased.
///
/// Expects `dims`, `lbs`, `ivs` and `steps` to have been filled by
/// `collectBounds` for `numBlockDims + numThreadDims` loops.
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());

  // A constant 1 is only needed if at least one dimension has no loop.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Swap the terminator of the innermost loop for a gpu.terminator, keeping
  // the original location, before the body is moved.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);

  // Move the whole innermost body to the front of the launch body.
  launchOp.getBody().front().getOperations().splice(
      launchOp.getBody().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Emit the induction variable replacements at the start of the launch body
  // so that they precede the operations moved above.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement = builder.create<arith::AddIOp>(rootForOp.getLoc(),
                                                        lbs[en.index()], id);
    en.value().replaceAllUsesWith(ivReplacement);
  }

  // The nest is now empty and unused.
  rootForOp.erase();
}
/// File-local driver for the affine loop nest conversion: validates the nest
/// with checkAffineLoopNestMappable, collects the bounds of the
/// `numBlockDims + numThreadDims` outermost loops, and builds the launch.
/// Returns failure if validation fails or the bounds cannot be collected.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  LogicalResult mappable =
      checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims);
  if (failed(mappable))
    return failure();

  const unsigned numMappedLoops = numBlockDims + numThreadDims;
  AffineLoopToGpuConverter converter;
  std::optional<AffineForOp> innermostLoop =
      converter.collectBounds(forOp, numMappedLoops);
  if (!innermostLoop.has_value())
    return failure();

  converter.createLaunch(forOp, *innermostLoop, numBlockDims, numThreadDims);
  return success();
}
/// Public entry point; forwards to the file-local function of the same name
/// in the global namespace (hence the explicit `::` qualification).
LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  LogicalResult status =
      ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
  return status;
}
namespace {
/// Rewrite pattern on scf.parallel whose `matchAndRewrite` (defined later in
/// this file) builds a gpu.launch operation from the matched loop.
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
// Inherit the base class constructors.
using OpRewritePattern<ParallelOp>::OpRewritePattern;
LogicalResult matchAndRewrite(ParallelOp parallelOp,
PatternRewriter &rewriter) const override;
};
}
/// Tries to find a constant that can stand in as an upper bound for
/// `upperBound`. Returns a value defined by an `arith.constant` index op on
/// success and a null value otherwise. New constants are created at the
/// current insertion point of `rewriter`.
///
/// Handled producers:
///   * an index constant: returned as is;
///   * `affine.min`: the first constant result expression of its map;
///   * `arith.minsi`: the first operand for which a bound can be derived;
///   * `arith.muli`: the product of the bounds derived for both operands,
///     unless exactly one of them is negative.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    // The recursive call returns a null value when no bound can be derived.
    // Check for that before asking for the defining op, which is not valid
    // on a null value.
    Value lhsBound = deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter);
    if (!lhsBound)
      return {};
    auto lhs = lhsBound.getDefiningOp<arith::ConstantIndexOp>();
    if (!lhs)
      return {};

    Value rhsBound = deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter);
    if (!rhsBound)
      return {};
    auto rhs = rhsBound.getDefiningOp<arith::ConstantIndexOp>();
    if (!rhs)
      return {};

    // With factors of mixed sign the product of two upper bounds is not an
    // upper bound of the product, so give up.
    if ((lhs.value() < 0) != (rhs.value() < 0))
      return {};

    return rewriter.create<arith::ConstantIndexOp>(
        multiplyOp.getLoc(), lhs.value() * rhs.value());
  }

  return {};
}
/// Returns true for every processor kind except `Sequential`.
static bool isMappedToProcessor(gpu::Processor processor) {
  const bool isSequential = processor == gpu::Processor::Sequential;
  return !isSequential;
}
/// Maps a hardware processor kind to an index in the range [0, 5]:
/// BlockX/Y/Z map to 0/1/2 and ThreadX/Y/Z map to 3/4/5. Callers in this file
/// use the index both as a gpu.launch body argument number and as a
/// gpu.launch operand number. Any other processor kind is a programming
/// error.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  if (processor == gpu::Processor::BlockX)
    return 0;
  if (processor == gpu::Processor::BlockY)
    return 1;
  if (processor == gpu::Processor::BlockZ)
    return 2;
  if (processor == gpu::Processor::ThreadX)
    return 3;
  if (processor == gpu::Processor::ThreadY)
    return 4;
  if (processor == gpu::Processor::ThreadZ)
    return 5;
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}
/// Emits, at the current insertion point of `rewriter` (inside the body of
/// `launchOp`), the index computations for one scf.parallel and schedules its
/// body for cloning.
///
/// For every dimension of `parallelOp`, according to its mapping attribute:
///   * If it is mapped to a hardware processor, the induction variable is
///     replaced by an affine.apply of the matching launch body argument that
///     composes the annotation's map with `id * step + lowerBound`. If the
///     annotation also carries a bound map, the launch bound for that
///     processor is computed before `launchOp` and stored in `bounds`. When
///     the upper bound had to be approximated by a static bound, the
///     following code is wrapped in an scf.if comparing against the original
///     upper bound.
///   * Otherwise a sequential scf.for is created.
/// In both cases the induction variable is registered in `cloningMap`.
///
/// Whenever a new nested region (scf.if / scf.for) is entered, `launchOp` is
/// pushed onto `worklist` as a sentinel so that the caller knows when to move
/// the insertion point back out. Afterwards the body operations of
/// `parallelOp` (without the terminator) are pushed in reverse order, so that
/// popping from the back yields them in program order.
///
/// Attributes of `parallelOp` other than the mapping and operand segment size
/// attributes are copied onto `launchOp`.
///
/// Returns failure if the loop has no mapping attribute, has results, uses
/// bounds or steps that cannot be made available outside the launch, or
/// defines the bound of a processor a second time.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  // A value is launch independent if the region defining it encloses the
  // region that contains the launch operation.
  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  // Returns `val` if it is launch independent, a fresh copy of it if it is
  // defined by an arith.constant, and a null value otherwise.
  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation =
        dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();

    if (isMappedToProcessor(processor)) {
      // Use the corresponding launch body argument as the hardware id.
      Value operand =
          launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      // d0 * s0 + s1, i.e. id * step + lowerBound, expressed as an affine map
      // so that it can be composed with the map from the annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      // ensureLaunchIndependent yields a null value for operands that are
      // neither launch independent nor constants; such a value must not be
      // used as an operand.
      Value launchStep = ensureLaunchIndependent(step);
      Value launchLowerBound = ensureLaunchIndependent(lowerBound);
      if (!launchStep || !launchLowerBound)
        return rewriter.notifyMatchFailure(
            parallelOp,
            "step and lower bound must be constant or defined outside of "
            "the launch");
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.getMap().compose(lowerAndStep),
          ValueRange{operand, launchStep, launchLowerBound});

      if (annotation.getBound()) {
        // Lower bound and step must be usable in front of the launch.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // The upper bound is precise if it can be used directly; otherwise a
        // static approximation is derived below.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          // Everything in this scope is emitted in front of the launch; the
          // guard restores the insertion point afterwards.
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // (d0 - s0) ceildiv s1, i.e. the number of iterations
          // (upperBound - lowerBound) ceildiv step.
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.getBound().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // Each processor may receive a bound only once.
          if (bounds.contains(processor)) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // The launch bound is an approximation: guard the body with a
          // comparison against the original upper bound.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Sentinel marking the end of the scf.if body.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Not mapped to hardware: emit a sequential loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Sentinel marking the end of the scf.for body.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Forward remaining attributes of the loop to the launch operation.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  // Schedule the body in reverse, as the worklist is consumed from the back.
  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}
/// Rewrites an outermost scf.parallel into a gpu.launch.
///
/// The op is first tagged with the visited attribute. Loops nested in another
/// scf.parallel are rejected. Otherwise a gpu.launch with all sizes set to the
/// constant 1 is created, the loop and any nested scf.parallel are processed
/// by processParallelLoop, every other scheduled operation is cloned into the
/// launch body, and finally the launch operands are overwritten with the
/// bounds that were collected. The original loop is erased on success.
///
/// Fails if an operation that is not memory-effect free, or that has regions,
/// is cloned before a nested scf.parallel in the same scope, or anywhere once
/// a nested scope has been left.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // Only outermost parallel loops are converted.
  if (parallelOp->getParentOfType<ParallelOp>())
    return failure();

  // Start with size 1 everywhere; real bounds are filled in at the end.
  Location loc = parallelOp.getLoc();
  Value unitSize =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), unitSize, unitSize, unitSize, unitSize, unitSize,
      unitSize);
  Block &launchBlock = launchOp.getBody().front();
  rewriter.setInsertionPointToEnd(&launchBlock);
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchBlock);

  IRMapping valueMapping;
  llvm::DenseMap<gpu::Processor, Value> collectedBounds;
  SmallVector<Operation *, 16> pending;
  if (failed(processParallelLoop(parallelOp, launchOp, valueMapping, pending,
                                 collectedBounds, rewriter)))
    return failure();

  // Tracks operations with effects or regions in the current scope; cleared
  // whenever a nested scope is left.
  bool sawSideEffects = false;
  // Set once any nested scope has been left.
  bool exitedNestedScope = false;
  Operation *scopeSentinel = launchOp.getOperation();
  while (!pending.empty()) {
    Operation *current = pending.pop_back_val();

    if (auto innerParallel = dyn_cast<ParallelOp>(current)) {
      // No effects may precede a nested parallel loop.
      if (sawSideEffects)
        return failure();
      if (failed(processParallelLoop(innerParallel, launchOp, valueMapping,
                                     pending, collectedBounds, rewriter)))
        return failure();
      continue;
    }

    if (current == scopeSentinel) {
      // End of a nested body: continue after the op that owns it.
      Operation *enclosing = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(enclosing);
      exitedNestedScope = true;
      sawSideEffects = false;
      continue;
    }

    // Plain operation: clone it and remember where its results went.
    Operation *cloned = rewriter.clone(*current, valueMapping);
    valueMapping.map(current->getResults(), cloned->getResults());
    sawSideEffects |=
        !isMemoryEffectFree(cloned) || cloned->getNumRegions() != 0;
    if (sawSideEffects && exitedNestedScope)
      return failure();
  }

  // Everything succeeded; install the launch bounds.
  for (auto entry : collectedBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(entry.first), entry.second);

  rewriter.eraseOp(parallelOp);
  return success();
}
/// Adds ParallelToGpuLaunchLowering to `patterns`, constructed with the
/// context of the pattern set.
void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  patterns.add<ParallelToGpuLaunchLowering>(context);
}
/// Configures `target` for the parallel-loop-to-GPU conversion: the memref
/// dialect is legal, and an scf.parallel is legal if it has no GPU mapping
/// attribute or has already been marked as visited.
void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();

  auto isLegalParallel = [](scf::ParallelOp loop) {
    // Loops without a mapping are not candidates for conversion.
    if (!loop->hasAttr(gpu::getMappingAttrName()))
      return true;
    return loop->hasAttr(kVisitedAttrName);
  };
  target.addDynamicallyLegalOp<scf::ParallelOp>(isLegalParallel);
}
/// Removes the visited marker attribute from every scf.parallel found by
/// walking `op`.
void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  auto clearVisitedMarker = [](scf::ParallelOp loop) {
    loop->removeAttr(kVisitedAttrName);
  };
  op->walk(clearVisitedMarker);
}