// RUN: triton-opt %s --convert-nv-gpu-to-llvm -split-input-file | FileCheck %s
// CHECK-LABEL: @nvvm_syncs
llvm.func @nvvm_syncs() {
// CHECK: wgmma.fence.sync.aligned;
nvgpu.wgmma_fence
// CHECK: wgmma.commit_group.sync.aligned;
nvgpu.wgmma_commit_group
// CHECK: barrier.cluster.wait.aligned;
nvgpu.cluster_wait
// CHECK: fence.proxy.async.shared::cta;
nvgpu.fence_async_shared {bCluster = false}
// CHECK: fence.proxy.async.shared::cluster;
nvgpu.fence_async_shared {bCluster = true}
// CHECK: barrier.cluster.arrive.aligned;
nvgpu.cluster_arrive {relaxed = false}
// CHECK: barrier.cluster.arrive.relaxed.aligned;
nvgpu.cluster_arrive {relaxed = true}
llvm.return
}
// CHECK-LABEL: @cluster_id
llvm.func @cluster_id() -> i32 {
// CHECK: %cluster_ctaid.x;
// CHECK-SAME: %cluster_ctaid.y;
// CHECK-SAME: %cluster_ctaid.z;
// CHECK-SAME: %cluster_nctaid.x;
// CHECK-SAME: %cluster_nctaid.y;
%id = nvgpu.cluster_id
llvm.return %id : i32
}
// -----
// CHECK-LABEL: @st_matrix
llvm.func @st_matrix(%i: i32, %ptr: !llvm.ptr<3>) {
// CHECK: stmatrix.sync.aligned.m8n8.x4.shared.b16 [$0], {$1, $2, $3, $4};
nvgpu.stmatrix %ptr, %i, %i, %i, %i : !llvm.ptr<3>, i32, i32, i32, i32
llvm.return
}
// -----
!struct_128xf32 = !llvm.struct<(
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32
)>
!struct_64xf32 = !llvm.struct<(
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32,
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32
)>
// CHECK-LABEL: @wgmma
llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
// CHECK: wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2
%false = llvm.mlir.constant(false) : i1
%acc0 = nvgpu.wgmma %desc, %desc, %false {
eltTypeA = 3 : i32,
eltTypeB = 3 : i32,
eltTypeC = 7 : i32,
layoutA = 0 : i32,
layoutB = 1 : i32,
m = 64 : i32,
n = 256 : i32,
k = 32 : i32
} : (i64, i64, i1) -> !struct_128xf32
// CHECK: // wait for regs: $0,$1,$2,{{.*}},$127
// CHECK: wgmma.wait_group.sync.aligned 0;
%out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct_64xf32
llvm.return
}