// RUN: triton-opt %s -allow-unregistered-dialect -test-print-allocation -verify-diagnostics -o /dev/null
// RUN: triton-opt %s -allow-unregistered-dialect -test-print-allocation="get-scratch-size-function=ValidConstant" 2>&1 | FileCheck %s --check-prefix=CHECK-128

// Check there are no lines with a size different to 128 and we have at least a line with size 128.

// CHECK-128-NOT: scratch offset = {{.*}}, size = {{^(128)}}
// CHECK-128: scratch offset = {{.*}}, size = 128
// CHECK-128-NOT: scratch offset = {{.*}}, size = {{^(128)}}

#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#sliceAd0 = #ttg.slice<{dim = 0, parent = #AL}>
#BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#A_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#A_SHARED_T = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0, 1]}>
#B_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#C = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
#A_DOT = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>
#B_DOT = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>
#NVMMA_SHARED_0 = #ttg.nvmma_shared<{swizzlingByteWidth = 0, transposed = false, elementBitWidth = 16}>
#NVMMA_SHARED_32 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 16}>
#NVMMA_SHARED_64 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
#NVMMA_SHARED_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#NVMMA_SHARED_FP4PADDED = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8, fp4Padded = true}>
#PADDED_SHARED_0 = #ttg.padded_shared<[256:+8] {order = [1, 0]}>
#PADDED_SHARED_1 = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}>
#PADDED_SHARED_2 = #ttg.padded_shared<[64:+2, 128:+4, 256:+8] {order = [1, 0]}>

#smem = #ttg.shared_memory

module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {

// expected-remark @below {{empty}}
// expected-remark @below {{size = 0}}
tt.func @empty(%A : !tt.ptr<f16>) {
  %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  %0 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #AL>
  tt.return
}

// expected-remark @below {{matmul_loop}}
// expected-remark @below {{size = 8192}}
tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
  %a_ptr_init = tt.splat %A : !tt.ptr<f16> -> tensor<128x32x!tt.ptr<f16>, #AL>
  %b_ptr_init = tt.splat %B : !tt.ptr<f16> -> tensor<32x128x!tt.ptr<f16>, #BL>

  %a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
  %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL>
  %b_mask = arith.constant dense<true> : tensor<32x128xi1, #BL>
  %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL>
  %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C>

  %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL>
  %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL>

  scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>) {
    %a_ = tt.load %a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr<f16>, #AL>
    // expected-remark @below {{scratch offset = 0, size = 8192}}
    %a = ttg.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A_DOT>
    %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr<f16>, #BL>
    // expected-remark @below {{scratch offset = 0, size = 8192}}
    %b = ttg.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B_DOT>

    %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C>

    %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr<f16>, #AL>, tensor<128x32xi32, #AL>
    %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr<f16>, #BL>, tensor<32x128xi32, #BL>
    scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>
  }
  tt.return
}

// Shared memory is available after a tensor's liveness range ends
// expected-remark @below {{reusable}}
// expected-remark @below {{size = 8192}}
tt.func @reusable(%A : !tt.ptr<f16>) {
  %cst1 = arith.constant dense<true> : tensor<128x32xi1, #AL>
  %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL>
  %cst3 = arith.constant dense<true> : tensor<32x128xi1, #AL>
  %cst4 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #AL>
  %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C>

  %a_ptr = tt.splat %A : !tt.ptr<f16> -> tensor<128x32x!tt.ptr<f16>, #AL>
  %b_ptr = tt.splat %A : !tt.ptr<f16> -> tensor<32x128x!tt.ptr<f16>, #AL>
  %a1_ = tt.load %a_ptr, %cst1, %cst2 : tensor<128x32x!tt.ptr<f16>, #AL>
  // expected-remark @below {{scratch offset = 0, size = 8192}}
  %a1 = ttg.convert_layout %a1_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A_DOT>
  %a2_ = tt.load %b_ptr, %cst3, %cst4 : tensor<32x128x!tt.ptr<f16>, #AL>
  // expected-remark @below {{scratch offset = 0, size = 8192}}
  %a2 = ttg.convert_layout %a2_ : tensor<32x128xf16, #AL> -> tensor<32x128xf16, #B_DOT>
  %a3_ = tt.load %a_ptr, %cst1, %cst2 : tensor<128x32x!tt.ptr<f16>, #AL>
  // expected-remark @below {{scratch offset = 0, size = 8192}}
  %a3 = ttg.convert_layout %a3_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A_DOT>
  %c = tt.dot %a1, %a2, %c_init : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C>
  %a4_ = tt.load %b_ptr, %cst3, %cst4 : tensor<32x128x!tt.ptr<f16>, #AL>
  // expected-remark @below {{scratch offset = 0, size = 8192}}
  %a4 = ttg.convert_layout %a4_ : tensor<32x128xf16, #AL> -> tensor<32x128xf16, #B_DOT>
  %c1 = tt.dot %a3, %a4, %c : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C>
  tt.return
}

// A tensor's shared memory offset is larger than it needs to accommodate further tensors
// %cst0->%c
// %cst1->%cst4
// %cst3->%g->%h->%i
// expected-remark @below {{preallocate}}
// expected-remark @below {{size = 12288}}
tt.func @preallocate(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 2048, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 3072, size = 512}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 3584, size = 512}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1024, size = 1024}}
  %b = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>

  ttg.local_dealloc %cst0 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 2048, size = 1024}}
  %c = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>

  ttg.local_dealloc %cst1 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst2 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>

  // expected-remark @below {{offset = 3072, size = 1024}}
  %cst4 = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 4096, size = 2048}}
  %e = ttg.local_alloc : () -> !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %a : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 6144, size = 2048}}
  %d = ttg.local_alloc : () -> !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %b : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 8192, size = 2048}}
  %f = ttg.local_alloc : () -> !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst4 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %c : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 10240, size = 2048}}
  %cst5 = ttg.local_alloc : () -> !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 4096}}
  %g = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %e : !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 4096}}
  %h = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %d : !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 4096}}
  %i = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %f : !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst5 : !ttg.memdesc<64x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// Unused tensors are immediately released
// expected-remark @below {{unused}}
// expected-remark @below {{size = 1024}}
tt.func @unused(%A : !tt.ptr<f16>) {
  %cst = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL>
  // expected-remark @below {{0, size = 1024}}
  %cst0 = ttg.local_alloc %cst : (tensor<32x16xf16, #AL>) -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// cst0 is alive through the entire function, it cannot be released before the end of the function
// expected-remark @below {{longlive}}
// expected-remark @below {{size = 2560}}
tt.func @longlive(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 2048, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1024, size = 512}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1536, size = 512}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst1 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst2 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>

  // expected-remark @below {{offset = 1024, size = 512}}
  %cst3 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1536, size = 512}}
  %cst4 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %b = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst5 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst6 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %c = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst3 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst4 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %d = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst0 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// This example triggers graph coloring with > 1 colors.
// expected-remark @below {{multi_color}}
// expected-remark @below {{size = 1376}}
tt.func @multi_color(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 1024, size = 64}}
  %cst = ttg.local_alloc : () -> !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1344, size = 32}}
  %cst_0 = ttg.local_alloc : () -> !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1088, size = 128}}
  %cst_1 = ttg.local_alloc : () -> !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  // expected-remark @below {{scratch offset = 0, size = 1024}}
  %0 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
  %1 = ttg.local_load %cst : !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x8xf16, #AL>
  // expected-remark @below {{offset = 0, size = 128}}
  %cst_3 = ttg.local_alloc : () -> !ttg.memdesc<4x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %2 = ttg.local_load %cst_0 : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
  // expected-remark @below {{scratch offset = 0, size = 1024}}
  %3 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
  // expected-remark @below {{offset = 512, size = 256}}
  %cst_4 = ttg.local_alloc : () -> !ttg.memdesc<4x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 64}}
  %cst_5 = ttg.local_alloc : () -> !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %4 = ttg.local_load %cst_5 : !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x8xf16, #AL>
  %5 = ttg.local_load %cst_5 : !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x8xf16, #AL>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst_6 = ttg.local_alloc : () -> !ttg.memdesc<8x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1216, size = 128}}
  %cst_7 = ttg.local_alloc : () -> !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %6 = ttg.local_load %cst_0 : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst_8 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 32}}
  %cst_9 = ttg.local_alloc : () -> !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst_10 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %7 = ttg.local_load %cst_1 : !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x4xf16, #AL>
  %8 = ttg.local_load %cst_4 : !ttg.memdesc<4x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x32xf16, #AL>
  // expected-remark @below {{scratch offset = 0, size = 1024}}
  %9 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
  %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #AL>
  %10 = ttg.local_load %cst_7 : !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<2x32xf16, #AL>
  %cst_12 = arith.constant dense<0.000000e+00> : tensor<4x16xf16, #AL>
  %cst_13 = arith.constant dense<0.000000e+00> : tensor<8x32xf16, #AL>
  tt.return
}

// This example triggers graph coloring with multiple rounds
// expected-remark @below {{multi_color_multi_rounds}}
// expected-remark @below {{size = 9376}}
tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
  // expected-remark @below {{offset = 9344, size = 32}}
  %cst = ttg.local_alloc : () -> !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 9216, size = 128}}
  %cst_0 = ttg.local_alloc : () -> !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 8192}}
  %cst_1 = ttg.local_alloc : () -> !ttg.memdesc<1024x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  // expected-remark @below {{scratch offset = 8192, size = 1024}}
  %0 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
  %1 = ttg.local_load %cst : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
  // expected-remark @below {{offset = 8704, size = 128}}
  %cst_3 = ttg.local_alloc : () -> !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %2 = ttg.local_load %cst : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
  // expected-remark @below {{offset = 8192, size = 512}}
  %cst_4 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %3 = ttg.local_load %cst_0 : !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x4xf16, #AL>
  %4 = ttg.local_load %cst_1 : !ttg.memdesc<1024x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<1024x4xf16, #AL>
  // expected-remark @below {{scratch offset = 0, size = 1024}}
  %5 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
  %6 = ttg.local_load %cst_3 : !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<2x32xf16, #AL>
  tt.return
}


// expected-remark @below {{alloc}}
// expected-remark @below {{size = 512}}
tt.func @alloc(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  // expected-remark @below {{offset = 0, size = 512}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}


// expected-remark @below {{dealloc}}
// expected-remark @below {{size = 2048}}
tt.func @dealloc(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 1024}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1024, size = 1024}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst0 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{scratch}}
// expected-remark @below {{size = 128}}
tt.func @scratch() {
  %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
  // expected-remark @below {{scratch offset = 0, size = 128}}
  %b = "tt.reduce" (%cst0) ({
  ^bb0(%arg0: f16, %arg1: f16):
    %add = arith.addf %arg0, %arg1 : f16
    tt.reduce.return %add : f16
  }) {axis = 0 : i32} : (tensor<16x16xf16, #AL>) -> tensor<16xf16, #sliceAd0>
  tt.return
}

// expected-remark @below {{trans}}
// expected-remark @below {{size = 1024}}
tt.func @trans(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 1024}}
  %tensor = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %b = ttg.memdesc_trans %tensor {order=array<i32: 1,0>} : !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x16xf16, #A_SHARED_T, #ttg.shared_memory, mutable>
  tt.return
}


// expected-remark @below {{extract_slice}}
// expected-remark @below {{size = 512}}
tt.func @extract_slice(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %index = arith.constant 0 : i32
  %cst1 = ttg.memdesc_index %cst0[%index] : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{atomic_scalar}}
// expected-remark @below {{size = 8196}}
tt.func @atomic_scalar(%arg3: !tt.ptr<i32>) -> i32 {
  %c0_i32 = arith.constant 0 : i32
  %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL>
  // expected-remark @below {{offset = 0, size = 8192}}
  %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>
  // expected-remark @below {{scratch offset = 8192, size = 4}}
  %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr<i32>, i32, i32) -> i32
  %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL>
  tt.return %4 : i32
}

// expected-remark @below {{atomic_scalar_no_use}}
// expected-remark @below {{size = 8192}}
tt.func @atomic_scalar_no_use(%arg3: !tt.ptr<i32>) {
  %c0_i32 = arith.constant 0 : i32
  %1 = arith.constant dense<1.0> : tensor<128x32xf16, #AL>
  // expected-remark @below {{offset = 0, size = 8192}}
  %2 = ttg.local_alloc %1 : (tensor<128x32xf16, #AL>) -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory>
  %4 = tt.atomic_cas acq_rel, gpu, %arg3, %c0_i32, %c0_i32 : (!tt.ptr<i32>, i32, i32) -> i32
  %3 = ttg.local_load %2 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory> -> tensor<128x32xf16, #AL>
  tt.return
}

// B0 -> (B1) -> B0
// Memory used by B1 can be reused by B0.
// expected-remark @below {{if}}
// expected-remark @below {{size = 2048}}
tt.func @if(%i1 : i1) {
  // expected-remark @below {{offset = 1024, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1536, size = 512}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  scf.if %i1 {
    // expected-remark @below {{offset = 0, size = 1024}}
    %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 0, size = 1024}}
    %b = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    ttg.local_dealloc %cst0 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    ttg.local_dealloc %cst1 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  // expected-remark @below {{offset = 1024, size = 512}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1536, size = 512}}
  %cst3 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 1024}}
  %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst2 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst3 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// B0 -> (B1) -> (B2) -> B0
// Memory used by B0 cannot be reused by B1 or B2.
// expected-remark @below {{if_else}}
// expected-remark @below {{size = 3072}}
tt.func @if_else(%i1 : i1) {
  // expected-remark @below {{offset = 1536, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 2048, size = 512}}
  %cst1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  scf.if %i1 {
    // expected-remark @below {{offset = 0, size = 1024}}
    %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 0, size = 1024}}
    %b = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  } else {
    // expected-remark @below {{offset = 1024, size = 512}}
    %cst2 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 2560, size = 512}}
    %cst3 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 0, size = 1024}}
    %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    ttg.local_dealloc %cst2 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    ttg.local_dealloc %cst3 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  // expected-remark @below {{offset = 0, size = 1024}}
  %a = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst0 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  ttg.local_dealloc %cst1 : !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// Block arguments and yields are memory aliases that do not trigger a new
// allocation.
// expected-remark @below {{for}}
// expected-remark @below {{size = 24576}}
tt.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 8192}}
  %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 8192, size = 8192}}
  %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 16384, size = 8192}}
  %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
    scf.yield %b_shared, %a_shared, %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  tt.return
  // CHECK-NEXT: size = 24576
}

// expected-remark @below {{for_if_slice}}
// expected-remark @below {{size = 24576}}
tt.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
  // expected-remark @below {{offset = 0, size = 8192}}
  %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 8192, size = 8192}}
  %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 16384, size = 8192}}
  %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
    scf.if %i1 {
      %zero = arith.constant 0 : i32
      %index = arith.constant 8 : i32
      %cst0 = ttg.memdesc_index %a_shared[%index] : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
      scf.yield
    }
    scf.yield %b_shared, %a_shared, %a_shared : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  tt.return
}

// c0 cannot be released in the loop
// expected-remark @below {{for_use_ancestor}}
// expected-remark @below {{size = 32768}}
tt.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
  // expected-remark @below {{offset = 0, size = 8192}}
  %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 8192, size = 8192}}
  %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 16384, size = 8192}}
  %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
    %c0 = ttg.memdesc_trans %c_shared_init {order=array<i32: 1,0>} : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x128xf16, #A_SHARED_T, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 24576, size = 8192}}
    %c1 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
    scf.yield %b_shared, %a_shared: !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  tt.return
}

// a_shared_init, b_shared_init, and c_shared_init's liveness ranges are span over the entire function before cst2.
// So they cannot be reused by cst0 and cst1, but can be reused by cst2.
// expected-remark @below {{for_for_if}}
// expected-remark @below {{size = 40960}}
tt.func @for_for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
  // expected-remark @below {{offset = 0, size = 8192}}
  %a_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 8192, size = 8192}}
  %b_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 16384, size = 8192}}
  %c_shared_init = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
    %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (!ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>) {
      %c_shared_next_next = scf.if %i1 -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable> {
        // expected-remark @below {{offset = 24576, size = 8192}}
        %cst0 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
        scf.yield %cst0 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
      } else {
        // expected-remark @below {{offset = 32768, size = 8192}}
        %cst1 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
        scf.yield %cst1 : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
      }
      scf.yield %c_shared_next_next : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
    }
    scf.yield %a_shared, %b_shared, %c_shared_next : !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>, !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  // expected-remark @below {{offset = 0, size = 8192}}
  %cst2 = ttg.local_alloc : () -> !ttg.memdesc<128x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{alloc1}}
// expected-remark @below {{size = 512}}
tt.func @alloc1(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{alloc2}}
// expected-remark @below {{size = 1024}}
tt.func @alloc2(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 1024}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{alloc3}}
// expected-remark @below {{size = 1024}}
tt.func @alloc3(%cond : i1) {
  scf.if %cond {
    // expected-remark @below {{offset = 0, size = 512}}
    %cst0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  } else {
    // expected-remark @below {{offset = 0, size = 1024}}
    %cst0 = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  }
  tt.return
}

// expected-remark @below {{alloc4}}
// expected-remark @below {{size = 1024}}
tt.func @alloc4(%A : !tt.ptr<f16>, %cond : i1) {
  scf.if %cond {
    // expected-remark @below {{virtual offset = 0, size = 1024}}
    tt.call @alloc3(%cond) : (i1) -> ()
  } else {
    // expected-remark @below {{virtual offset = 0, size = 512}}
    tt.call @alloc1(%A) : (!tt.ptr<f16>) -> ()
  }
  tt.return
}

// expected-remark @below {{single_call}}
// expected-remark @below {{size = 512}}
tt.func @single_call(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  // expected-remark @below {{virtual offset = 0, size = 512}}
  tt.call @alloc1(%A) : (!tt.ptr<f16>) -> ()
  tt.return
}

// expected-remark @below {{multiple_calls}}
// expected-remark @below {{size = 1024}}
tt.func @multiple_calls(%A : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{virtual offset = 0, size = 512}}
  tt.call @alloc1(%A) : (!tt.ptr<f16>) -> ()
  %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  // expected-remark @below {{virtual offset = 0, size = 1024}}
  tt.call @alloc2(%A) : (!tt.ptr<f16>) -> ()
  tt.return
}

// expected-remark @below {{if_else_calls}}
// expected-remark @below {{size = 1024}}
tt.func @if_else_calls(%A : !tt.ptr<f16>, %cond : i1) {
  %cst = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  scf.if %cond {
    // expected-remark @below {{offset = 0, size = 512}}
    %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{offset = 0, size = 1024}}
    %cst1 = ttg.local_alloc %cst : (tensor<16x32xf16, #AL>) -> !ttg.memdesc<16x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
    // expected-remark @below {{virtual offset = 0, size = 512}}
    tt.call @alloc1(%A) : (!tt.ptr<f16>) -> ()
  } else {
    %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
    // expected-remark @below {{virtual offset = 0, size = 1024}}
    tt.call @alloc2(%A) : (!tt.ptr<f16>) -> ()
  }
  tt.return
}

// expected-remark @below {{for_calls}}
// expected-remark @below {{size = 512}}
tt.func @for_calls(%A : !tt.ptr<f16>, %cond : i1) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
  %lb = arith.constant 0 : index
  %ub = arith.constant 10 : index
  %step = arith.constant 1 : index
  scf.for %iv = %lb to %ub step %step {
    // expected-remark @below {{virtual offset = 0, size = 512}}
    tt.call @alloc1(%A) : (!tt.ptr<f16>) -> ()
  }
  tt.return
  // CHECK-NEXT: size = 512
}

// expected-remark @below {{call_graph_1}}
// expected-remark @below {{size = 1024}}
tt.func @call_graph_1(%A : !tt.ptr<f16>, %cond : i1) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{virtual offset = 0, size = 1024}}
  tt.call @alloc3(%cond) : (i1) -> ()
  tt.return
}

// expected-remark @below {{call_graph_2}}
// expected-remark @below {{size = 1024}}
tt.func @call_graph_2(%A : !tt.ptr<f16>, %cond : i1) {
  // expected-remark @below {{offset = 0, size = 512}}
  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{virtual offset = 0, size = 1024}}
  tt.call @alloc4(%A, %cond) : (!tt.ptr<f16>, i1) -> ()
  tt.return
}

// expected-remark @below {{scan_alloc}}
// expected-remark @below {{size = 128}}
tt.func @scan_alloc(%x : tensor<8x16xf32, #AL>) {
  // expected-remark @below {{offset = 0, size = 128}}
  %a = "tt.scan"(%x) <{axis = 0 : i32, reverse = false}>({
  ^bb0(%arg0: f32, %arg1: f32):
    %add = arith.addf %arg0, %arg1 : f32
    tt.scan.return %add : f32
  }) : (tensor<8x16xf32, #AL>) -> tensor<8x16xf32, #AL>
  tt.return
}

// expected-remark @below {{warp_specialize_default_region}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @warp_specialize_default_region() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()

  tt.return
}

// expected-remark @below {{nonoverlapping_liveness_in_default_region}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @nonoverlapping_liveness_in_default_region() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%1) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    // expected-remark @below {{offset = 16, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%2) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()

  tt.return
}

// expected-remark @below {{overlapping_liveness_in_default_region}}
// expected-remark @below {{size = 49}}
// expected-remark @below {{offset = 48, size = 1}}
tt.func @overlapping_liveness_in_default_region() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    // expected-remark @below {{offset = 32, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%1) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    "use"(%2) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()

  tt.return
}

// expected-remark @below {{alias_through_default_outputs}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @alias_through_default_outputs() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  %1 = ttg.warp_specialize()
  default {
    ttg.warp_yield %0 : !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  // expected-remark @below {{offset = 16, size = 16}}
  %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  "use"(%1) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
  tt.return
}

// expected-remark @below {{implicit_capture_liveness}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @implicit_capture_liveness() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  tt.return
}

// expected-remark @below {{implicit_and_explicit_capture_liveness}}
// expected-remark @below {{size = 45}}
// expected-remark @below {{offset = 44, size = 1}}
tt.func @implicit_and_explicit_capture_liveness() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  // expected-remark @below {{offset = 16, size = 16}}
  %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  // expected-remark @below {{offset = 32, size = 12}}
  ttg.warp_specialize(%1)
  default {
    "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_yield
  }
  partition0(%arg0: !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) num_warps(1) {
    ttg.warp_return
  } : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
  tt.return
}

// expected-remark @below {{explicit_capture_liveness}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @explicit_capture_liveness() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  // expected-remark @below {{offset = 16, size = 12}}
  ttg.warp_specialize(%0)
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_yield
  }
  partition0(%arg0: !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) num_warps(1) {
    ttg.warp_return
  } : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
  tt.return
}

// expected-remark @below {{implicit_capture_liveness_default}}
// expected-remark @below {{size = 33}}
// expected-remark @below {{offset = 32, size = 1}}
tt.func @implicit_capture_liveness_default() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // FIXME: This is correct, but not optimal. The memory for `%0` should be
    // reused for the next allocation. The same problem happens with `scf.if`.
    "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  tt.return
}

// expected-remark @below {{liveness_in_partition}}
// expected-remark @below {{size = 36}}
// expected-remark @below {{offset = 32, size = 4}}
tt.func @liveness_in_partition() {
  ttg.warp_specialize()
  default {
    ttg.warp_yield
  }
  partition0() num_warps(4) {
    // expected-remark @below {{offset = 0, size = 16}}
    %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    // expected-remark @below {{offset = 16, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_return
  } : () -> ()
  tt.return
}

// expected-remark @below {{aliasing_in_partition}}
// expected-remark @below {{size = 36}}
// expected-remark @below {{offset = 32, size = 4}}
tt.func @aliasing_in_partition() {
  ttg.warp_specialize()
  default {
    ttg.warp_yield
  }
  partition0() num_warps(4) {
    // expected-remark @below {{offset = 0, size = 16}}
    %0 = ttg.local_alloc : () -> !ttg.memdesc<2x1xi64, #A_SHARED, #smem, mutable>
    %c0_i32 = arith.constant 0 : i32
    %1 = ttg.memdesc_index %0[%c0_i32] : !ttg.memdesc<2x1xi64, #A_SHARED, #smem, mutable> -> !ttg.memdesc<1xi64, #A_SHARED, #smem, mutable>
    // expected-remark @below {{offset = 16, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%1) : (!ttg.memdesc<1xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_return
  } : () -> ()
  tt.return
}

// expected-remark @below {{partition_region_interference}}
// expected-remark @below {{size = 88}}
// expected-remark @below {{offset = 80, size = 8}}
tt.func @partition_region_interference() {
  // expected-remark @below {{offset = 0, size = 16}}
  %0 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 16, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_yield
  }
  partition0() num_warps(4) {
    // expected-remark @below {{offset = 32, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    // expected-remark @below {{offset = 48, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    "use"(%1) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
    ttg.warp_return
  }
  partition1() num_warps(4) {
    // expected-remark @below {{offset = 64, size = 16}}
    %1 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    // expected-remark @below {{offset = 64, size = 16}}
    %2 = ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_return
  } : () -> ()
  "use"(%0) : (!ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>) -> ()
  tt.return
}

// expected-remark @below {{two_different_ws}}
// expected-remark @below {{size = 17}}
// expected-remark @below {{offset = 16, size = 1}}
tt.func @two_different_ws() {
  ttg.warp_specialize()
  default {
    // expected-remark @below {{offset = 0, size = 16}}
    ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    ttg.warp_return
  } : () -> ()
  ttg.warp_specialize()
  default {
    ttg.warp_yield
  }
  partition0() num_warps(1) {
    // expected-remark @below {{offset = 0, size = 16}}
    ttg.local_alloc : () -> !ttg.memdesc<2xi64, #A_SHARED, #smem, mutable>
    ttg.warp_return
  } : () -> ()
  tt.return
}

// expected-remark @below {{ptr_allocation_datalayout}}
// expected-remark @below {{size = 8}}
tt.func @ptr_allocation_datalayout(%arg0: !tt.ptr<i32>) {
  // expected-remark @below {{offset = 0, size = 8}}
  ttg.warp_specialize(%arg0)
  default {
    ttg.warp_yield
  } : (!tt.ptr<i32>) -> ()
  tt.return
}

// expected-remark @below {{tightly_packed_captures}}
// expected-remark @below {{size = 9}}
tt.func @tightly_packed_captures(%arg0: i8, %arg1: i64) {
  // expected-remark @below {{offset = 0, size = 9}}
  ttg.warp_specialize(%arg0, %arg1)
  default {
    ttg.warp_yield
  } : (i8, i64) -> ()
  tt.return
}
// expected-remark @below {{nvmma_alignment}}
// expected-remark @below {{size = 1088}}
tt.func @nvmma_alignment(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
  // expected-remark @below {{offset = 0, size = 128}}
  %fp4 = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_FP4PADDED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 0, size = 64}}
  %a = ttg.local_alloc : () -> !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 128, size = 64}}
  %b = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 256, size = 64}}
  %c = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_32, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 512, size = 64}}
  %d = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_64, #ttg.shared_memory, mutable>
  // expected-remark @below {{offset = 1024, size = 64}}
  %e = ttg.local_alloc : () -> !ttg.memdesc<8x8xi8, #NVMMA_SHARED_128, #ttg.shared_memory, mutable>

  ttg.local_dealloc %a : !ttg.memdesc<32xf16, #A_SHARED, #ttg.shared_memory, mutable>
  tt.return
}


// expected-remark @below {{padded_shared_layout_size}}
// expected-remark @below {{size = 1058}}
tt.func @padded_shared_layout_size() {
  // expected-remark @+2 {{offset = 0, size = 510}}
  // 255 * 2B = 510B
  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<1x255xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 512}}
  // 256 * 2B = 512B
  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<1x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 530}}
  // (257 + 8) * 2B = 530B
  %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<1x257xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 1038}}
  // (511 + 8) * 2B = 1038B
  %alloc3 = ttg.local_alloc : () -> !ttg.memdesc<1x511xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 1040}}
  // (512 + 8 * 1) * 2B = 1040B
  %alloc4 = ttg.local_alloc : () -> !ttg.memdesc<1x512xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 1058}}
  // (513 + 8 * 2) * 2B = 1058B
  %alloc5 = ttg.local_alloc : () -> !ttg.memdesc<1x513xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 512}}
  // 16 * 16 * 2B = 512B
  %alloc6 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 1040}}
  // (16 * 32 + 8 * 1) * 2B = 1040B
  %alloc7 = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 1008}}
  // (31 * 16 + 8) * 2B = 1008B
  %alloc8 = ttg.local_alloc : () -> !ttg.memdesc<31x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{padded_shared_layout_element_type}}
// expected-remark @below {{size = 16864}}
tt.func @padded_shared_layout_element_type() {
  // expected-remark @+2 {{offset = 0, size = 4216}}
  // (16 * 256 + 8 * 15) * 1B = 4216B
  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 8432}}
  // (16 * 256 + 8 * 15) * 2B = 8432B
  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 16864}}
  // (16 * 256 + 8 * 15) * 4B = 16864B
  %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf32, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
  tt.return
}

// expected-remark @below {{padded_shared_layout_multi_tier}}
// expected-remark @below {{size = 4466}}
tt.func @padded_shared_layout_multi_tier() {
  // expected-remark @+2 {{offset = 0, size = 4340}}
  // (16 * 256 + 4 * 31 + 8 * 15) * 1B = 4340B
  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_1, #ttg.shared_memory, mutable>
  // expected-remark @+2 {{offset = 0, size = 4466}}
  // (16 * 256 + 2 * 63 + 4 * 31 + 8 * 15) * 1B = 4466B
  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_2, #ttg.shared_memory, mutable>
  tt.return
}
}