// RUN: triton-opt --split-input-file %s | FileCheck %s

module {
  // CHECK-LABEL: proton_record
  tt.func @proton_record() {
    // CHECK: proton.record start "name0"
    // CHECK: proton.record end "name0"
    // CHECK-NEXT: tt.return
    proton.record start "name0"
    proton.record end "name0"
    tt.return
  }
} // end module

// -----

#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-warps" = 8 : i32} {
  // CHECK-LABEL: protongpu_ops
  tt.func @protongpu_ops() {
    // CHECK: ttg.local_alloc
    // CHECK-NEXT: proton_gpu.global_scratch_alloc
    // CHECK-NEXT: proton_gpu.segment_alloc
    // CHECK-NEXT: proton_gpu.init_ctx
    // CHECK-NEXT: proton_gpu.read_counter
    // CHECK-NEXT: proton_gpu.circular_store start
    // CHECK-NEXT: gpu.barrier
    // CHECK-NEXT: proton_gpu.save_ctx
    // CHECK-NEXT: proton_gpu.finalize
    // CHECK-NEXT: tt.return
    %0 = ttg.local_alloc : () -> !ttg.memdesc<64xi32, #shared, #smem, mutable>
    %1 = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 384 : i32} : !tt.ptr<i32>
    %seg = proton_gpu.segment_alloc %0 : !ttg.memdesc<64xi32, #shared, #smem, mutable> -> !proton_gpu.segment<256, #shared, warp>
    proton_gpu.init_ctx %1 : !tt.ptr<i32>
    %3 = proton_gpu.read_counter : i32
    proton_gpu.circular_store start %seg, %3 {scopeId = 0 : i32} : !proton_gpu.segment<256, #shared, warp>, i32
    gpu.barrier
    proton_gpu.save_ctx %seg, %1: !proton_gpu.segment<256, #shared, warp>, !tt.ptr<i32>
    proton_gpu.finalize %seg, %1 : !proton_gpu.segment<256, #shared, warp>, !tt.ptr<i32>
    tt.return
  }
} // end module