#include "mlir/Dialect/SCF/Transforms/Passes.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
namespace mlir {
#define GEN_PASS_DEF_SCFPARALLELLOOPTILING
#include "mlir/Dialect/SCF/Transforms/Passes.h.inc"
}
using namespace mlir;
using namespace mlir::scf;
/// Tiles the scf.parallel loop `op` into two nested scf.parallel loops.
///
/// The outer loop reuses the lower and upper bounds of `op` and multiplies
/// every step by the tile size of that dimension. The inner loop starts at
/// zero, keeps the original steps and takes over the body of `op`; inside that
/// body every use of an original induction variable is replaced by
/// `innerIV + outerIV`.
///
/// \param op             Loop to tile. It is erased before this function
///                       returns, so the handle must not be used afterwards.
/// \param tileSizes      Tile size per loop dimension. Dimensions without an
///                       entry get a tile size of 1. Entries must be nonzero:
///                       when bounds and step are constants the tile size is
///                       used as the divisor of a remainder.
/// \param noMinMaxBounds If false, an inner upper bound that is not statically
///                       known to be a whole tile is computed with
///                       `affine.min`. If true, the inner loop always spans a
///                       whole tile and the body is wrapped in an `scf.if`
///                       that masks out-of-range iterations.
/// \returns The pair (outer loop, inner loop).
std::pair<ParallelOp, ParallelOp>
mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef<int64_t> tileSizes,
                            bool noMinMaxBounds) {
  OpBuilder b(op);
  auto zero = b.create<arith::ConstantIndexOp>(op.getLoc(), 0);

  // Materialize one constant tile size per loop dimension; dimensions that
  // have no entry in `tileSizes` are left untiled (tile size 1).
  SmallVector<Value, 2> tileSizeConstants;
  tileSizeConstants.reserve(op.getUpperBound().size());
  for (size_t i = 0, end = op.getUpperBound().size(); i != end; ++i) {
    if (i < tileSizes.size())
      tileSizeConstants.push_back(
          b.create<arith::ConstantIndexOp>(op.getLoc(), tileSizes[i]));
    else
      tileSizeConstants.push_back(
          b.create<arith::ConstantIndexOp>(op.getLoc(), 1));
  }

  // Create the outer loop: same bounds, steps scaled by the tile sizes.
  SmallVector<Value, 2> newSteps;
  newSteps.reserve(op.getStep().size());
  for (auto step : llvm::zip(op.getStep(), tileSizeConstants)) {
    newSteps.push_back(b.create<arith::MulIOp>(op.getLoc(), std::get<0>(step),
                                               std::get<1>(step)));
  }
  auto outerLoop = b.create<ParallelOp>(op.getLoc(), op.getLowerBound(),
                                        op.getUpperBound(), newSteps);
  b.setInsertionPointToStart(outerLoop.getBody());

  // Map (d0, d1, d2) -> (d0, d1 - d2). It is applied below to
  // (outer step, upper bound, outer iv), so affine.min yields
  // min(tile extent, remaining extent).
  auto minMap = AffineMap::get(
      /*dimCount=*/3, /*symbolCount=*/0,
      {getAffineDimExpr(/*position=*/0, b.getContext()),
       getAffineDimExpr(/*position=*/1, b.getContext()) -
           getAffineDimExpr(/*position=*/2, b.getContext())},
      b.getContext());

  // Compute the upper bound of the inner loop for every dimension.
  SmallVector<Value, 2> newBounds;
  newBounds.reserve(op.getUpperBound().size());
  bool needInboundCheck = false;
  for (auto [lowerBound, upperBound, newStep, iv, step, tileSizeConstant] :
       llvm::zip(outerLoop.getLowerBound(), outerLoop.getUpperBound(),
                 outerLoop.getStep(), outerLoop.getInductionVars(),
                 op.getStep(), tileSizeConstants)) {
    // Collect whichever of the bounds and step are constant index ops.
    auto lowerBoundConstant =
        dyn_cast_or_null<arith::ConstantIndexOp>(lowerBound.getDefiningOp());
    auto upperBoundConstant =
        dyn_cast_or_null<arith::ConstantIndexOp>(upperBound.getDefiningOp());
    auto stepConstant =
        dyn_cast_or_null<arith::ConstantIndexOp>(step.getDefiningOp());
    auto tileSize =
        cast<arith::ConstantIndexOp>(tileSizeConstant.getDefiningOp()).value();

    // With constant bounds and step, and an iteration count that is a multiple
    // of the tile size, every tile is full: use the outer step as the bound.
    if (lowerBoundConstant && upperBoundConstant && stepConstant) {
      auto numIterations = llvm::divideCeil(upperBoundConstant.value() -
                                                lowerBoundConstant.value(),
                                            stepConstant.value());
      if (numIterations % tileSize == 0) {
        newBounds.push_back(newStep);
        continue;
      }
    }

    // In-bounds-check mode: always iterate over a whole tile and remember that
    // the body has to be guarded.
    if (noMinMaxBounds) {
      newBounds.push_back(newStep);
      needInboundCheck = true;
      continue;
    }

    // Otherwise compute the bound for each outer iteration with affine.min.
    newBounds.push_back(
        b.create<affine::AffineMinOp>(op.getLoc(), b.getIndexType(), minMap,
                                      ValueRange{newStep, upperBound, iv}));
  }
  auto innerLoop = b.create<ParallelOp>(
      op.getLoc(), SmallVector<Value, 2>(newBounds.size(), zero), newBounds,
      op.getStep());

  if (noMinMaxBounds && needInboundCheck) {
    b.setInsertionPointToStart(innerLoop.getBody());
    // Build the guard:
    //   %inbound = and over all dims of (%inner_iv + %outer_iv < %upper_bound)
    // The inner loop is created with the original step, so %inner_iv is
    // already a multiple of that step and %inner_iv + %outer_iv is exactly the
    // index the body uses. Scaling %inner_iv by the step again would test a
    // larger value and mask valid iterations whenever the step is not 1.
    // NOTE(review): the comparison is unsigned (ult); confirm this is intended
    // for loops whose index values may be negative.
    Value inbound =
        b.create<arith::ConstantIntOp>(op.getLoc(), 1, b.getIntegerType(1));
    for (auto [outerUpperBound, outerIV, innerIV] :
         llvm::zip(outerLoop.getUpperBound(), outerLoop.getInductionVars(),
                   innerLoop.getInductionVars())) {
      Value index = b.create<arith::AddIOp>(op.getLoc(), innerIV, outerIV);
      Value dimInbound = b.create<arith::CmpIOp>(
          op.getLoc(), arith::CmpIPredicate::ult, index, outerUpperBound);
      inbound = b.create<arith::AndIOp>(op.getLoc(), inbound, dimInbound);
    }
    auto ifInbound = b.create<IfOp>(op.getLoc(),
                                    /*resultTypes=*/ArrayRef<Type>{}, inbound,
                                    /*withElseRegion=*/false);
    // Move the original body into the `then` region of the guard.
    ifInbound.getThenRegion().takeBody(op.getRegion());
    Block &thenBlock = ifInbound.getThenRegion().front();
    // The moved block still ends with the terminator of the original loop;
    // swap it for the scf.yield that terminates an scf.if region.
    Operation *reduceOp = thenBlock.getTerminator();
    b.setInsertionPointToEnd(&thenBlock);
    b.create<scf::YieldOp>(reduceOp->getLoc());
    reduceOp->erase();
    // Rewire the former induction variables (now block arguments of the
    // `then` block) to `innerIV + outerIV`, then drop the arguments.
    b.setInsertionPointToStart(innerLoop.getBody());
    for (const auto &ivs : llvm::enumerate(llvm::zip(
             innerLoop.getInductionVars(), outerLoop.getInductionVars()))) {
      auto newIndex = b.create<arith::AddIOp>(
          op.getLoc(), std::get<0>(ivs.value()), std::get<1>(ivs.value()));
      thenBlock.getArgument(ivs.index())
          .replaceAllUsesExcept(newIndex, newIndex);
    }
    thenBlock.eraseArguments(0, thenBlock.getNumArguments());
  } else {
    // No guard needed: the inner loop takes the original body directly.
    innerLoop.getRegion().takeBody(op.getRegion());
    b.setInsertionPointToStart(innerLoop.getBody());
    for (auto ivs : llvm::zip(innerLoop.getInductionVars(),
                              outerLoop.getInductionVars())) {
      Value innerIndex = std::get<0>(ivs);
      auto newIndex = b.create<arith::AddIOp>(op.getLoc(), std::get<0>(ivs),
                                              std::get<1>(ivs));
      // Every use except the add itself now sees `innerIV + outerIV`.
      innerIndex.replaceAllUsesExcept(newIndex, newIndex);
    }
  }

  op.erase();
  return std::make_pair(outerLoop, innerLoop);
}
namespace {
/// Pass that tiles the innermost scf.parallel loops found under the operation
/// it runs on. Loops that carry reductions are skipped.
struct ParallelLoopTiling
    : public impl::SCFParallelLoopTilingBase<ParallelLoopTiling> {
  ParallelLoopTiling() = default;

  /// Constructs the pass with explicit values for the `tileSizes` and
  /// `noMinMaxBounds` options.
  explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes,
                              bool noMinMaxBounds = false) {
    this->tileSizes = tileSizes;
    this->noMinMaxBounds = noMinMaxBounds;
  }

  /// Rejects zero tile sizes, then tiles each innermost parallel loop that
  /// has no reductions.
  void runOnOperation() override {
    // A zero tile size is invalid: report it once and fail the pass.
    const bool hasZeroTileSize =
        llvm::any_of(tileSizes, [](int64_t size) { return size == 0; });
    if (hasZeroTileSize) {
      mlir::emitError(mlir::UnknownLoc::get(&Pass::getContext()),
                      "tile size cannot be 0");
      signalPassFailure();
      return;
    }

    // Collect the candidates first; tiling erases the loops it rewrites.
    SmallVector<ParallelOp, 2> candidates;
    getInnermostParallelLoops(getOperation(), candidates);
    for (ParallelOp candidate : candidates) {
      if (candidate.getNumReductions() != 0)
        continue;
      tileParallelLoop(candidate, tileSizes, noMinMaxBounds);
    }
  }
};
} // namespace
/// Creates a parallel-loop tiling pass configured with the given tile sizes
/// and `noMinMaxBounds` setting.
std::unique_ptr<Pass>
mlir::createParallelLoopTilingPass(ArrayRef<int64_t> tileSizes,
                                   bool noMinMaxBounds) {
  std::unique_ptr<Pass> tilingPass =
      std::make_unique<ParallelLoopTiling>(tileSizes, noMinMaxBounds);
  return tilingPass;
}