// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=compute-capability=70 2>&1 | FileCheck %s
// 1-D blocked layout shared by the tensors in this test: 4 elements per
// thread, 32 threads per warp, 4 warps per CTA, on a single CTA.
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
// CHECK-LABEL: clamp
module attributes {
  "triton_gpu.target" = "cuda:70",
  "triton_gpu.num-ctas" = 1 : i32,
  "triton_gpu.num-warps" = 4 : i32,
  "triton_gpu.threads-per-warp" = 32 : i32
} {
  // Clamps %x to the symmetric range [-%limit, %limit] with
  // `propagateNan = all`, and verifies the instruction sequence the
  // conversion pass emits for it.
  tt.func public @clamp(%x : tensor<1024xf32, #blocked>, %limit : tensor<1024xf32, #blocked>) attributes {noinline = false} {
    // The lower bound is built as 0.0 - %limit.
    %zero = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
    %neg_limit = arith.subf %zero, %limit : tensor<1024xf32, #blocked>

    // Expected output, on consecutive lines: an unordered-not-equal
    // comparison of a value against itself (true only when that value is
    // NaN), followed by maxnum, minnum, a constant, and a select.
    // CHECK: llvm.fcmp "une" %[[REG:[a-zA-Z0-9]+]], %[[REG]]
    // CHECK-NEXT: llvm.intr.maxnum
    // CHECK-NEXT: llvm.intr.minnum
    // CHECK-NEXT: llvm.mlir.constant
    // CHECK-NEXT: llvm.select
    %clamped = tt.clampf %x, %neg_limit, %limit, propagateNan = all : tensor<1024xf32, #blocked>
    tt.return
  }
}