* Copyright 2024 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "akg/Conversion/SCFToGPUExt/SCFToGPUExt.h"
#include <optional>
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
// Debug category used by the LLVM_DEBUG output emitted in this file.
#define DEBUG_TYPE "akg-loops-to-gpu"
using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;
// Name of the unit attribute that ParallelToGpuLaunchLowering::matchAndRewrite attaches to every
// scf.parallel it has looked at. configureParallelLoopToGPULegality treats ops carrying it as legal and
// finalizeParallelLoopToGPUConversion removes it again.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";
// Selects one component of a gpu::KernelDim3 by position: 0 -> x, 1 -> y, 2 -> z.
// Any other position is a programming error and ends in llvm_unreachable.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  if (pos == 0)
    return dim3.x;
  if (pos == 1)
    return dim3.y;
  if (pos == 2)
    return dim3.z;
  llvm_unreachable("dim3 position out of bounds");
  return nullptr;
}
// Returns the operand range that `forOp` reports for its lower bound.
// Used by checkAffineLoopNestMappableImpl to verify these values are defined above the loop nest.
static Operation::operand_range getLowerBoundOperands(affine::AffineForOp forOp) {
return forOp.getLowerBoundOperands();
}
// Returns the operand range that `forOp` reports for its upper bound.
// Used by checkAffineLoopNestMappableImpl to verify these values are defined above the loop nest.
static Operation::operand_range getUpperBoundOperands(affine::AffineForOp forOp) {
return forOp.getUpperBoundOperands();
}
// Emits an arith index constant holding the integer step of `forOp`, located at the loop's location and
// inserted at the current insertion point of `builder`.
static Value getOrCreateStep(affine::AffineForOp forOp, OpBuilder &builder) {
  const Location stepLoc = forOp.getLoc();
  const int64_t stepValue = forOp.getStepAsInt();
  return builder.create<arith::ConstantIndexOp>(stepLoc, stepValue);
}
// Produces the lower bound of `forOp` as a Value by delegating to lowerAffineLowerBound with `builder`.
// collectBounds treats a null result as failure.
static Value getOrEmitLowerBound(affine::AffineForOp forOp, OpBuilder &builder) {
return lowerAffineLowerBound(forOp, builder);
}
// Produces the upper bound of `forOp` as a Value by delegating to lowerAffineUpperBound with `builder`.
// collectBounds treats a null result as failure.
static Value getOrEmitUpperBound(affine::AffineForOp forOp, OpBuilder &builder) {
return lowerAffineUpperBound(forOp, builder);
}
// Verifies that the `numDims` outermost loops of the nest rooted at `forOp` can be mapped:
//  * the bound operands of every visited loop must be defined above the region of the outermost loop;
//  * every visited loop except the last must contain exactly two operations, the first of which is
//    another affine.for (the body of the last visited loop is not inspected).
// Emits an error on the offending operation and returns failure otherwise.
static LogicalResult checkAffineLoopNestMappableImpl(affine::AffineForOp forOp, unsigned numDims) {
  // Bound to the region of the outermost loop; intentionally not updated while descending.
  Region &outermostRegion = forOp.getRegion();
  for (unsigned depth = 0; depth < numDims; ++depth) {
    Block *loopBody = forOp.getBody();
    Operation *firstOp = &loopBody->front();

    const bool boundsDefinedOutside = areValuesDefinedAbove(getLowerBoundOperands(forOp), outermostRegion) &&
                                      areValuesDefinedAbove(getUpperBoundOperands(forOp), outermostRegion);
    if (!boundsDefinedOutside)
      return forOp.emitError(
        "loops with bounds depending on other mapped loops "
        "are not supported");

    // The last mapped loop needs no nesting check.
    if (depth + 1 == numDims)
      break;

    if (loopBody->empty() || std::next(loopBody->begin(), 2) != loopBody->end())
      return forOp.emitError("expected perfectly nested loops in the body");

    auto innerLoop = dyn_cast<affine::AffineForOp>(firstOp);
    if (!innerLoop)
      return firstOp->emitError("expected a nested loop");
    forOp = innerLoop;
  }
  return success();
}
// Validates the requested block/thread dimension counts and then the loop nest itself.
// Succeeds without inspecting the nest when either count is zero; emits an error on `forOp` when a count
// exceeds 3; otherwise defers to checkAffineLoopNestMappableImpl for `numBlockDims + numThreadDims` loops.
static LogicalResult checkAffineLoopNestMappable(affine::AffineForOp forOp, unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  constexpr unsigned kMaxDimsPerKind = 3;
  if (numBlockDims == 0 || numThreadDims == 0) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }
  if (numBlockDims > kMaxDimsPerKind)
    return forOp.emitError("cannot map to more than 3 block dimensions");
  if (numThreadDims > kMaxDimsPerKind)
    return forOp.emitError("cannot map to more than 3 thread dimensions");

  const unsigned totalDims = numBlockDims + numThreadDims;
  return checkAffineLoopNestMappableImpl(forOp, totalDims);
}
namespace {
// Two-step helper that turns an affine loop nest into a gpu.launch: collectBounds fills the vectors below,
// createLaunch consumes them. Each vector holds one entry per visited loop, outermost loop first.
struct AffineLoopToGpuConverter {
// Visits `numLoops` nested loops starting at `forOp` and records their data. Returns the last loop
// visited, or std::nullopt when a lower or upper bound could not be produced.
std::optional<affine::AffineForOp> collectBounds(affine::AffineForOp forOp, unsigned numLoops);
// Creates the gpu.launch before `rootForOp`, moves the body of `innermostForOp` into it, rewires the
// induction variables and erases `rootForOp`.
void createLaunch(affine::AffineForOp rootForOp, affine::AffineForOp innermostForOp, unsigned numBlockDims,
unsigned numThreadDims);
// Launch size per loop: (upper - lower), divided (rounding up) by the step when the step is not 1.
SmallVector<Value, 6> dims;
// Lower bound per loop.
SmallVector<Value, 6> lbs;
// Induction variable per loop.
SmallVector<Value, 6> ivs;
// Step per loop, as an index constant.
SmallVector<Value, 6> steps;
};
}
// Walks `numLoops` levels of the loop nest rooted at `forOp` and appends, for each level, the launch size,
// lower bound, induction variable and step to `dims`, `lbs`, `ivs` and `steps`.
//
// All helper operations (bounds, subtraction, step constant, ceil-division) are built with a builder
// positioned at `forOp`. The launch size of a loop is `upper - lower`, additionally ceil-divided by the
// step when the step is not the constant 1.
//
// Returns the last loop visited, or std::nullopt if a lower or upper bound could not be produced.
// Loops below the root are obtained with `cast`, so the caller must have validated the nesting first.
std::optional<affine::AffineForOp> AffineLoopToGpuConverter::collectBounds(affine::AffineForOp forOp,
                                                                           unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  affine::AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return std::nullopt;
    }
    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    // Only divide when needed: a unit step leaves the range unchanged.
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      range = builder.create<arith::CeilDivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);
    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);
    // Descend into the directly nested loop, except after the last requested level.
    if (i != numLoops - 1)
      currentLoop = cast<affine::AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}
// Replaces the loop nest rooted at `rootForOp` with a gpu.launch built from the data gathered by
// collectBounds.
//
// The first `numBlockDims` entries of `dims` become the grid sizes and the following `numThreadDims`
// entries the block sizes; dimensions without a mapped loop get the constant 1. The body of
// `innermostForOp` is moved into the launch body (its terminator is replaced by gpu.terminator), every
// recorded induction variable is replaced by `lowerBound + step * id` (the multiplication is skipped for
// a unit step), and `rootForOp` is erased.
void AffineLoopToGpuConverter::createLaunch(affine::AffineForOp rootForOp, affine::AffineForOp innermostForOp,
                                            unsigned numBlockDims, unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Only materialize the constant when at least one dimension is left unmapped.
  Value constOne =
    (numBlockDims < 3 || numThreadDims < 3) ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1) : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
  auto launchOp = builder.create<gpu::LaunchOp>(rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
                                                blockSizeY, blockSizeZ);

  // Swap the loop terminator for a gpu.terminator, then move the operations (not the block, whose
  // arguments differ) into the launch body.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
  launchOp.getBody().front().getOperations().splice(launchOp.getBody().front().begin(),
                                                    innermostForOp.getBody()->getOperations());

  // Rebuild each induction variable from the matching block/thread id at the top of the launch body.
  builder.setInsertionPointToStart(&launchOp.getBody().front());
  for (const auto &en : llvm::enumerate(ivs)) {
    const auto idx = en.index();
    Value id = idx < numBlockDims ? getDim3Value(launchOp.getBlockIds(), idx)
                                  : getDim3Value(launchOp.getThreadIds(), idx - numBlockDims);
    Value step = steps[idx];
    if (getConstantIntValue(step) != static_cast<int64_t>(1))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);
    Value ivReplacement = builder.create<arith::AddIOp>(rootForOp.getLoc(), lbs[idx], id);
    en.value().replaceAllUsesWith(ivReplacement);
  }

  // The nest is now empty of uses and can go.
  rootForOp.erase();
}
// File-local driver: validates the nest, collects bounds for `numBlockDims + numThreadDims` loops and
// builds the gpu.launch. Returns failure when validation fails or a bound cannot be collected.
// NOTE(review): checkAffineLoopNestMappable succeeds without inspecting the nest when either count is
// zero, yet the conversion below still runs in that case - confirm this is intended.
static LogicalResult convertAffineLoopNestToGPULaunch(affine::AffineForOp forOp, unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  const unsigned numMappedLoops = numBlockDims + numThreadDims;
  AffineLoopToGpuConverter converter;
  std::optional<affine::AffineForOp> innermostLoop = converter.collectBounds(forOp, numMappedLoops);
  if (!innermostLoop.has_value())
    return failure();

  converter.createLaunch(forOp, innermostLoop.value(), numBlockDims, numThreadDims);
  return success();
}
// Public entry point in namespace mlir; forwards unchanged to the file-local
// ::convertAffineLoopNestToGPULaunch overload defined above.
LogicalResult mlir::convertAffineLoopNestToGPULaunch(affine::AffineForOp forOp, unsigned numBlockDims,
unsigned numThreadDims) {
return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}
namespace {
// Rewrite pattern on scf.parallel whose matchAndRewrite (defined later in this file) builds a gpu.launch
// from the loop and its nested scf.parallel ops.
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
// Inherit the base-class constructors.
using OpRewritePattern<ParallelOp>::OpRewritePattern;
LogicalResult matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const override;
};
}
// Tries to find a constant index value that can stand in for `upperBound`.
//
// Handled producers:
//  * arith.constant (index): returned as is;
//  * affine.min: the first constant result expression of its map, materialized as a new index constant;
//  * arith.minsi: the first operand for which a static bound can be derived;
//  * arith.muli: the product of the static bounds of both operands, unless exactly one of them is negative.
// New constants are created through `rewriter` at its current insertion point.
// Returns a null Value when no static bound can be derived.
static Value deriveStaticUpperBound(Value upperBound, PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }
  if (auto minOp = upperBound.getDefiningOp<affine::AffineMinOp>()) {
    for (const AffineExpr &result : minOp.getMap().getResults()) {
      if (auto constExpr = dyn_cast<AffineConstantExpr>(result)) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(), constExpr.getValue());
      }
    }
  }
  if (auto minOp = upperBound.getDefiningOp<arith::MinSIOp>()) {
    for (Value operand : {minOp.getLhs(), minOp.getRhs()}) {
      if (auto staticBound = deriveStaticUpperBound(operand, rewriter))
        return staticBound;
    }
  }
  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    // The recursive call yields a null Value when no bound exists; querying the defining op of a null
    // Value is invalid, so check each result before looking at its producer. The right-hand side is only
    // examined once the left-hand side has produced a constant.
    Value lhsBound = deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter);
    if (!lhsBound)
      return {};
    auto lhs = lhsBound.getDefiningOp<arith::ConstantIndexOp>();
    if (!lhs)
      return {};
    Value rhsBound = deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter);
    if (!rhsBound)
      return {};
    auto rhs = rhsBound.getDefiningOp<arith::ConstantIndexOp>();
    if (!rhs)
      return {};
    // With mixed signs the product of two upper bounds is no longer an upper bound.
    if ((lhs.value() < 0) != (rhs.value() < 0))
      return {};
    return rewriter.create<arith::ConstantIndexOp>(multiplyOp.getLoc(), lhs.value() * rhs.value());
  }
  return {};
}
// True for every processor kind except gpu::Processor::Sequential.
static bool isMappedToProcessor(gpu::Processor processor) {
  const bool isSequential = (processor == gpu::Processor::Sequential);
  return !isSequential;
}
// Maps a hardware processor kind to an index in 0..5: BlockX/Y/Z -> 0/1/2, ThreadX/Y/Z -> 3/4/5.
// The index is used in this file both for the launch body argument and for the launch operand.
// Any other processor kind is a programming error and ends in llvm_unreachable.
static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
    case gpu::Processor::BlockX:
      return 0;
    case gpu::Processor::BlockY:
      return 1;
    case gpu::Processor::BlockZ:
      return 2;
    case gpu::Processor::ThreadX:
      return 3;
    case gpu::Processor::ThreadY:
      return 4;
    case gpu::Processor::ThreadZ:
      return 5;
    default:
      break;
  }
  llvm_unreachable("invalid processor type while retrieving launch op argument number");
}
// Lowers one (possibly nested) scf.parallel into the body of `launchOp`.
//
// For every loop dimension, in order:
//  * a dimension mapped to a hardware processor gets its induction variable replaced by an affine.apply
//    of the matching launch body argument (`id * step + lowerBound`, composed with the annotation's map).
//    If the annotation carries a bound map, the launch size for that processor is computed before
//    `launchOp` and stored in `bounds`; when the upper bound is only an approximation, the remaining code
//    is wrapped in an `scf.if (index < originalUpperBound)`;
//  * a sequential dimension becomes an scf.for.
// Whenever a new scf.if / scf.for scope is opened, `launchOp` itself is pushed onto `worklist` as a
// sentinel telling the caller when to leave that scope again. Finally the loop body (without terminator)
// is pushed in reverse order so the caller pops it in program order.
//
// parallelOp  loop to lower; must carry the GPU mapping attribute and have no results.
// launchOp    launch being populated; the rewriter's insertion point is expected to be inside its body.
// cloningMap  mapping from original values to their clones inside the launch; extended with the new ivs.
// worklist    operations still to be cloned by the caller.
// bounds      launch sizes discovered so far, keyed by processor; a processor may be set only once.
//
// Returns failure when the loop is not annotated, has results, carries a malformed annotation, uses a
// lower bound or step that is neither a constant nor defined outside the launch, has an upper bound for
// which no static bound can be derived, or redefines the bound of a processor.
static LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
                                         SmallVectorImpl<Operation *> &worklist,
                                         DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  ArrayAttr mapping = parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();
  Location loc = parallelOp.getLoc();

  // A value is launch independent when its parent region encloses the region holding the launch.
  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };
  // Returns `val` if it is launch independent, a fresh copy if it is an arith.constant, and a null
  // Value otherwise.
  auto ensureLaunchIndependent = [&rewriter, launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(), constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
                               parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = dyn_cast<gpu::ParallelLoopDimMappingAttr>(mappingAttribute);
    if (!annotation)
      return parallelOp.emitOpError() << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = annotation.getProcessor();
    if (isMappedToProcessor(processor)) {
      // Use the hardware id as replacement for the induction variable: id * step + lowerBound.
      Value operand = launchOp.getBody().getArgument(getLaunchOpArgumentNum(processor));
      AffineMap lowerAndStep = AffineMap::get(
        1, 2, rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1));
      // Step and lower bound must be usable here; a null operand must never reach the op builder.
      Value indexStep = ensureLaunchIndependent(step);
      Value indexLowerBound = ensureLaunchIndependent(lowerBound);
      if (!indexStep || !indexLowerBound) {
        return rewriter.notifyMatchFailure(parallelOp,
                                           "lower bound and step of a processor-mapped loop must be constant or "
                                           "defined outside of the launch");
      }
      newIndex = rewriter.create<AffineApplyOp>(loc, annotation.getMap().compose(lowerAndStep),
                                                ValueRange{operand, indexStep, indexLowerBound});
      if (annotation.getBound()) {
        // Lower bound and step were verified above to be constant or launch independent. The upper bound
        // is precise under the same condition; otherwise a static over-approximation is derived.
        bool boundIsPrecise =
          launchIndependent(upperBound) || isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          // Launch sizes are computed in front of the launch operation.
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(parallelOp,
                                                 "cannot derive loop-invariant upper bound for number of "
                                                 "iterations");
            }
          }
          // Number of iterations: (upperBound - lowerBound) ceilDiv step, composed with the bound map.
          AffineMap stepMap = AffineMap::get(1, 2,
                                             ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                                                .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<affine::AffineApplyOp>(
            loc, annotation.getBound().compose(stepMap),
            ValueRange{ensureLaunchIndependent(cloningMap.lookupOrDefault(upperBound)),
                       ensureLaunchIndependent(cloningMap.lookupOrDefault(lowerBound)),
                       ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          if (bounds.contains(processor)) {
            return rewriter.notifyMatchFailure(
              parallelOp, "cannot redefine the bound for processor " + Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // The launch size is an over-approximation: guard the body with the original upper bound.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, newIndex,
                                                              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Sentinel: the launch op can never be part of a loop body.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Sequential dimension: emit a plain scf.for and continue inside its body.
      auto loopOp =
        rewriter.create<scf::ForOp>(loc, cloningMap.lookupOrDefault(lowerBound), cloningMap.lookupOrDefault(upperBound),
                                    cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Sentinel: the launch op can never be part of a loop body.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Forward all remaining attributes of the loop to the launch operation.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator())) worklist.push_back(&op);
  return success();
}
// Converts an outermost, mapping-annotated scf.parallel into a gpu.launch.
//
// The op is first tagged with kVisitedAttrName. A launch with all sizes set to 1 is created, the loop is
// lowered into it via processParallelLoop, and the queued body operations are then cloned one by one.
// Nested scf.parallel ops are lowered in place; the launch op itself acts as a sentinel that moves the
// insertion point out of the current scf.if / scf.for scope. Once everything is cloned, the discovered
// launch sizes are written into the launch operands and the original loop is erased.
//
// Fails when the loop is nested inside another scf.parallel, when processParallelLoop fails, when a nested
// scf.parallel follows an operation that has memory effects or regions, or when such an operation shows
// up after a nested scope has been left.
LogicalResult ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const {
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());
  if (parallelOp->getParentOfType<ParallelOp>())
    return failure();

  // Start with unit sizes everywhere; the real sizes are patched in at the end.
  const Location loc = parallelOp.getLoc();
  Value unitSize = rewriter.create<arith::ConstantIndexOp>(loc, 1);
  auto launchOp = rewriter.create<gpu::LaunchOp>(loc, unitSize, unitSize, unitSize, unitSize, unitSize, unitSize);
  Block *launchBlock = &launchOp.getBody().front();
  rewriter.setInsertionPointToEnd(launchBlock);
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(launchBlock);

  IRMapping valueMap;
  llvm::DenseMap<gpu::Processor, Value> discoveredBounds;
  SmallVector<Operation *, 16> pending;
  if (failed(processParallelLoop(parallelOp, launchOp, valueMap, pending, discoveredBounds, rewriter)))
    return failure();

  // Set once an operation with memory effects or regions was cloned; cleared when a scope is left.
  bool sawSideEffects = false;
  // Set once any scope has been left, i.e. cloning no longer happens in the innermost scope.
  bool exitedScope = false;
  while (!pending.empty()) {
    Operation *current = pending.pop_back_val();

    if (auto nestedParallel = dyn_cast<ParallelOp>(current)) {
      // Entering a nested loop is only allowed if nothing side-effecting preceded it.
      if (sawSideEffects ||
          failed(processParallelLoop(nestedParallel, launchOp, valueMap, pending, discoveredBounds, rewriter)))
        return failure();
      continue;
    }

    if (current == launchOp.getOperation()) {
      // Sentinel: continue after the operation that encloses the current insertion point.
      Operation *enclosing = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(enclosing);
      exitedScope = true;
      sawSideEffects = false;
      continue;
    }

    Operation *copy = rewriter.clone(*current, valueMap);
    valueMap.map(current->getResults(), copy->getResults());
    if (!isMemoryEffectFree(copy) || copy->getNumRegions() != 0)
      sawSideEffects = true;
    if (sawSideEffects && exitedScope)
      return failure();
  }

  for (const auto &entry : discoveredBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(entry.first), entry.second);
  rewriter.eraseOp(parallelOp);
  return success();
}
// Adds ParallelToGpuLaunchLowering to `patterns`, constructed with the context of the pattern set.
void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  auto *context = patterns.getContext();
  patterns.add<ParallelToGpuLaunchLowering>(context);
}
// Configures `target` for the parallel-loop-to-GPU conversion: the memref dialect is legal, and an
// scf.parallel is legal when it has no GPU mapping attribute or has already been marked as visited.
void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    const bool hasMapping = parallelOp->hasAttr(gpu::getMappingAttrName());
    if (!hasMapping)
      return true;
    return parallelOp->hasAttr(kVisitedAttrName);
  });
}
// Removes the kVisitedAttrName marker from every scf.parallel reached by walking `op`.
void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp visitedLoop) {
    visitedLoop->removeAttr(kVisitedAttrName);
  });
}