// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s
#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: buffer_atomic_cas_i64
tt.func public @buffer_atomic_cas_i64(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}) {
// CHECK: %[[cas_val:.*]] = llvm.mlir.constant(2 : i64) : i64
// CHECK: %[[cas_val_cast:.*]] = llvm.bitcast %[[cas_val]] : i64 to i64
// CHECK: %[[cas_val_insert:.*]] = llvm.insertvalue %[[cas_val_cast]], %{{.*}}[1] : !llvm.struct<(i64, i64)>
%val = arith.constant dense<2> : tensor<512xi64, #blocked>
// CHECK: %[[cas_cmp:.*]] = llvm.mlir.constant(0 : i64) : i64
// CHECK: %[[cas_cmp_cast:.*]] = llvm.bitcast %[[cas_cmp]] : i64 to i64
// CHECK: %[[cas_cmp_insert:.*]] = llvm.insertvalue %[[cas_cmp_cast]], %{{.*}}[1] : !llvm.struct<(i64, i64)>
%cmp = arith.constant dense<0> : tensor<512xi64, #blocked>
%c512_i32 = arith.constant 512 : i32
%0 = tt.get_program_id x : i32
%1 = arith.muli %0, %c512_i32 : i32
%offsets = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
%scalar_ptr = tt.addptr %arg0, %1 : !tt.ptr<i64>, i32
// CHECK: %[[cas_val_extract:.*]] = llvm.extractvalue %[[cas_val_insert]][0] : !llvm.struct<(i64, i64)>
// CHECK: %[[cas_cmp_extract:.*]] = llvm.extractvalue %[[cas_cmp_insert]][0] : !llvm.struct<(i64, i64)>
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
// CHECK: llvm.fence syncscope("agent") release
// CHECK: %[[cas_val_insert2:.*]] = llvm.insertelement %[[cas_val_extract]], %{{.*}} : vector<1xi64>
// CHECK: %[[cas_cmp_insert2:.*]] = llvm.insertelement %[[cas_cmp_extract]], %{{.*}} : vector<1xi64>
// CHECK: %[[cas_val_cast2:.*]] = llvm.bitcast %[[cas_val_insert2]] : vector<1xi64> to i64
// CHECK: %[[cas_cmp_cast2:.*]] = llvm.bitcast %[[cas_cmp_insert2]] : vector<1xi64> to i64
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[cas_val_cast2]], %[[cas_cmp_cast2]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %{{.*}}, %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
// CHECK: llvm.fence syncscope("agent") acquire
%4 = amdgpu.buffer_atomic_cas acq_rel, gpu, %cmp, %val, %scalar_ptr[%offsets] : tensor<512xi64, #blocked>
%5 = tt.addptr %arg1, %1 : !tt.ptr<i64>, i32
amdgpu.buffer_store %4, %5[%offsets] : tensor<512xi64, #blocked>
tt.return
}
}