#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "third_party/nvidia/include/Dialect/NVWS/Transforms/Passes.h"
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
using namespace mlir;
using namespace triton;
using namespace triton::gpu;
namespace ttng = triton::nvidia_gpu;
namespace mlir::triton::gpu {
#define GEN_PASS_DEF_TRITONGPUAUTOMATICWARPSPECIALIZATION
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
}
namespace {
struct AutomaticWarpSpecialization
: triton::gpu::impl::TritonGPUAutomaticWarpSpecializationBase<
AutomaticWarpSpecialization> {
using TritonGPUAutomaticWarpSpecializationBase::
TritonGPUAutomaticWarpSpecializationBase;
void runOnOperation() override;
};
}
void AutomaticWarpSpecialization::runOnOperation() {
OpPassManager pm;
pm.addPass(createTritonGPUPartitionScheduling());
pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
pm.addPass(createTritonGPURewritePartitionDependencies());
pm.addPass(createSCCPPass());
pm.addPass(createCSEPass());
pm.addPass(createNVWSAssignStagePhase());
pm.addPass(createNVWSLowerAref());
pm.addPass(createTritonGPUPartitionLoops());
pm.addPass(createNVWSLowerWarpGroup());
if (failed(runPipeline(pm, getOperation())))
return signalPassFailure();
RewritePatternSet patterns(&getContext());
populateForOpDeadArgumentElimination(patterns);
scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
scf::IfOp::getCanonicalizationPatterns(patterns, &getContext());
WarpSpecializeOp::getCanonicalizationPatterns(patterns, &getContext());
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
pm.clear();
pm.addPass(createTritonGPUOptimizePartitionWarps());
pm.addPass(createTritonGPUScheduleLoops());
if (failed(runPipeline(pm, getOperation())))
return signalPassFailure();
}