// RUN: akg-opt --alloc-buffer-shrink -split-input-file %s | FileCheck %s
// CHECK-LABEL: func.func @akg_fused_clone_40_auto_fallback(%arg0: memref<4x2x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<input>, hacc.input_idx = #hacc.input_idx<0>}, %arg1: memref<4x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<input>, hacc.input_idx = #hacc.input_idx<1>}, %arg2: memref<4x2x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<output>, hacc.output_idx = #hacc.output_idx<0>}) -> memref<4x2x116x14x14xf32> attributes {OperatorType = "Elementwise", enable_auto_mark_buffer_size, hacc.block_dim = 1 : i64, hacc.entry, hacc.function_kind = #hacc.function_kind<DEVICE>} {
// CHECK-NEXT: %c4 = arith.constant 4 : index
// CHECK-NEXT: %c116 = arith.constant 116 : index
// CHECK-NEXT: %c2 = arith.constant 2 : index
// CHECK-NEXT: %c196 = arith.constant 196 : index
// CHECK-NEXT: %c4_0 = arith.constant 4 : index
// CHECK-NEXT: %c116_1 = arith.constant 116 : index
// CHECK-NEXT: %c2_2 = arith.constant 2 : index
// CHECK-NEXT: %c196_3 = arith.constant 196 : index
// CHECK-NEXT: %c196_4 = arith.constant 196 : index
// CHECK-NEXT: %c2_5 = arith.constant 2 : index
// CHECK-NEXT: %c116_6 = arith.constant 116 : index
// CHECK-NEXT: %c4_7 = arith.constant 4 : index
// CHECK-NEXT: %c0 = arith.constant 0 : index
// CHECK-NEXT: %c1 = arith.constant 1 : index
// CHECK-NEXT: %collapse_shape = memref.collapse_shape %arg0 {{.*}} : memref<4x2x116x14x14xf32> into memref<4x232x196xf32>
// CHECK-NEXT: %expand_shape = memref.expand_shape %arg1 {{.*}} output_shape [4, 58, 2, 14, 14] : memref<4x116x14x14xf32> into memref<4x58x2x14x14xf32>
// CHECK-NEXT: %collapse_shape_8 = memref.collapse_shape %expand_shape {{.*}} : memref<4x58x2x14x14xf32> into memref<4x58x2x196xf32>
// CHECK-NEXT: %subview = memref.subview %collapse_shape[0, 0, 0] [4, 116, 196] [1, 1, 1] : memref<4x232x196xf32> to memref<4x116x196xf32, strided<[45472, 196, 1]>>
// CHECK-NEXT: %expand_shape_9 = memref.expand_shape %subview {{.*}} output_shape [4, 58, 2, 14, 14] : memref<4x116x196xf32, strided<[45472, 196, 1]>> into memref<4x58x2x14x14xf32, strided<[45472, 392, 196, 14, 1]>>
// CHECK-NEXT: %collapse_shape_10 = memref.collapse_shape %expand_shape_9 {{.*}} : memref<4x58x2x14x14xf32, strided<[45472, 392, 196, 14, 1]>> into memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
// CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x116x2x196xf32>
// CHECK-NEXT: %c0_11 = arith.constant 0 : index
// CHECK-NEXT: %subview_12 = memref.subview %alloc[0, 0, 0, 0] [1, 58, 2, 196] [1, 1, 1, 1] : memref<1x116x2x196xf32> to memref<1x58x2x196xf32, strided<[45472, 392, 196, 1]>>
// CHECK-NEXT: %subview_13 = memref.subview %alloc[0, 58, 0, 0] [1, 58, 2, 196] [1, 1, 1, 1] : memref<1x116x2x196xf32> to memref<1x58x2x196xf32, strided<[45472, 392, 196, 1], offset: 22736>>
// CHECK-NEXT: %collapse_shape_14 = memref.collapse_shape %arg2 {{.*}} : memref<4x2x116x14x14xf32> into memref<4x2x116x196xf32>
// CHECK-NEXT: %c0_15 = arith.constant 0 : index
// CHECK-NEXT: %c1_16 = arith.constant 1 : index
// CHECK-NEXT: scf.for %arg3 = %c0_15 to %c1_16 step %c1_16 {
// CHECK-NEXT: scf.for %arg4 = %c0_15 to %c1_16 step %c1_16 {
// CHECK-NEXT: %c40 = arith.constant 40 : index
// CHECK-NEXT: %0 = affine.apply #map(%arg3, %arg4)
// CHECK-NEXT: %c4_17 = arith.constant 4 : index
// CHECK-NEXT: %1 = affine.apply #map1(%0)[%c4_0]
// CHECK-NEXT: %2 = affine.min #map2(%1, %c4_17)
// CHECK-NEXT: %c1_18 = arith.constant 1 : index
// CHECK-NEXT: %3 = arith.addi %0, %c1_18 : index
// CHECK-NEXT: %4 = affine.apply #map1(%3)[%c4_0]
// CHECK-NEXT: %5 = affine.min #map2(%4, %c4_17)
// CHECK-NEXT: scf.for %arg5 = %2 to %5 step %c1_16 {
// CHECK-NEXT: %c0_19 = arith.constant 0 : index
// CHECK-NEXT: %c1_20 = arith.constant 1 : index
// CHECK-NEXT: scf.for %arg6 = %c0_19 to %c1_20 step %c1_20 {
// CHECK-NEXT: scf.for %arg7 = %c0_19 to %c1_20 step %c1_20 {
// CHECK-NEXT: %c40_21 = arith.constant 40 : index
// CHECK-NEXT: %6 = affine.apply #map(%arg6, %arg7)
// CHECK-NEXT: %c116_22 = arith.constant 116 : index
// CHECK-NEXT: %7 = affine.apply #map1(%6)[%c116_1]
// CHECK-NEXT: %8 = affine.min #map2(%7, %c116_22)
// CHECK-NEXT: %c1_23 = arith.constant 1 : index
// CHECK-NEXT: %9 = arith.addi %6, %c1_23 : index
// CHECK-NEXT: %10 = affine.apply #map1(%9)[%c116_1]
// CHECK-NEXT: %11 = affine.min #map2(%10, %c116_22)
// CHECK-NEXT: scf.for %arg8 = %8 to %11 step %c1_20 {
// CHECK-NEXT: %c0_24 = arith.constant 0 : index
// CHECK-NEXT: %c1_25 = arith.constant 1 : index
// CHECK-NEXT: scf.for %arg9 = %c0_24 to %c1_25 step %c1_25 {
// CHECK-NEXT: scf.for %arg10 = %c0_24 to %c1_25 step %c1_25 {
// CHECK-NEXT: %c40_26 = arith.constant 40 : index
// CHECK-NEXT: %12 = affine.apply #map(%arg9, %arg10)
// CHECK-NEXT: %c2_27 = arith.constant 2 : index
// CHECK-NEXT: %13 = affine.apply #map1(%12)[%c2_2]
// CHECK-NEXT: %14 = affine.min #map2(%13, %c2_27)
// CHECK-NEXT: %c1_28 = arith.constant 1 : index
// CHECK-NEXT: %15 = arith.addi %12, %c1_28 : index
// CHECK-NEXT: %16 = affine.apply #map1(%15)[%c2_2]
// CHECK-NEXT: %17 = affine.min #map2(%16, %c2_27)
// CHECK-NEXT: scf.for %arg11 = %14 to %17 step %c1_25 {
// CHECK-NEXT: %c0_29 = arith.constant 0 : index
// CHECK-NEXT: %c1_30 = arith.constant 1 : index
// CHECK-NEXT: scf.for %arg12 = %c0_29 to %c1_30 step %c1_30 {
// CHECK-NEXT: scf.for %arg13 = %c0_29 to %c1_30 step %c1_30 {
// CHECK-NEXT: %c40_31 = arith.constant 40 : index
// CHECK-NEXT: %18 = affine.apply #map(%arg12, %arg13)
// CHECK-NEXT: %c196_32 = arith.constant 196 : index
// CHECK-NEXT: %19 = affine.apply #map1(%18)[%c196_3]
// CHECK-NEXT: %20 = affine.min #map2(%19, %c196_32)
// CHECK-NEXT: %c1_33 = arith.constant 1 : index
// CHECK-NEXT: %21 = arith.addi %18, %c1_33 : index
// CHECK-NEXT: %22 = affine.apply #map1(%21)[%c196_3]
// CHECK-NEXT: %23 = affine.min #map2(%22, %c196_32)
// CHECK-NEXT: scf.for %arg14 = %20 to %23 step %c1_30 {
// CHECK-NEXT: %24 = affine.apply #map3(%arg8)
// CHECK-NEXT: %25 = arith.cmpi sge, %24, %c0 {skip_vectorize} : index
// CHECK-NEXT: scf.if %25 {
// CHECK-NEXT: %29 = memref.load %collapse_shape_10[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
// CHECK-NEXT: memref.store %29, %subview_12[%c0_11, %arg8, %arg11, %arg14] : memref<1x58x2x196xf32, strided<[45472, 392, 196, 1]>>
// CHECK-NEXT: }
// CHECK-NEXT: %26 = affine.apply #map3(%arg8)
// CHECK-NEXT: %27 = arith.cmpi sge, %26, %c0 {skip_vectorize} : index
// CHECK-NEXT: scf.if %27 {
// CHECK-NEXT: %29 = memref.load %collapse_shape_8[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32>
// CHECK-NEXT: memref.store %29, %subview_13[%c0_11, %arg8, %arg11, %arg14] : memref<1x58x2x196xf32, strided<[45472, 392, 196, 1], offset: 22736>>
// CHECK-NEXT: }
// CHECK-NEXT: %28 = memref.load %alloc[%c0_11, %arg8, %arg11, %arg14] : memref<1x116x2x196xf32>
// CHECK-NEXT: memref.store %28, %collapse_shape_14[%arg5, %arg11, %arg8, %arg14] : memref<4x2x116x196xf32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: } {map_for_to_forall}
// CHECK-NEXT: return %arg2 : memref<4x2x116x14x14xf32>
// CHECK-NEXT: }
// Affine map aliases used by the input function below.
// #map: sum of two values; applied to each pair of single-iteration loop
// induction variables to form one combined index.
#map = affine_map<(d0, d1) -> (d0 + d1)>
// #map1: product of a dimension and a symbol; applied to a combined index with
// a dimension extent passed as the symbol.
#map1 = affine_map<(d0)[s0] -> (d0 * s0)>
// #map2: two-result identity map; used only as the operand map of affine.min,
// so the op yields min(d0, d1).
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #map3: 57 - d0; the result is non-negative exactly when d0 <= 57. It feeds
// the `sge 0` comparisons guarding the copies into the temporary buffer.
#map3 = affine_map<(d0) -> (-d0 + 57)>
// Input IR for the alloc-buffer-shrink test (the pass is selected by the RUN
// line at the top of this file). The expected output keeps the same loop
// structure but expects the temporary allocation to shrink from
// memref<4x116x2x196xf32> to memref<1x116x2x196xf32>, with every leading
// %arg5 index into that buffer replaced by a constant 0.
module {
// Private rank-0 i64 global initialised to 0; not referenced by the function
// in this module.
memref.global "private" @global_seed : memref<i64> = dense<0>
// Takes two input memrefs (%arg0, %arg1) and one output memref (%arg2), fills
// %arg2 element by element through a temporary buffer, and returns %arg2.
func.func @akg_fused_clone_40_auto_fallback(%arg0: memref<4x2x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<input>, hacc.input_idx = #hacc.input_idx<0>}, %arg1: memref<4x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<input>, hacc.input_idx = #hacc.input_idx<1>}, %arg2: memref<4x2x116x14x14xf32> {hacc.arg_type = #hacc.arg_type<output>, hacc.output_idx = #hacc.output_idx<0>}) -> memref<4x2x116x14x14xf32> attributes {OperatorType = "Elementwise", enable_auto_mark_buffer_size, hacc.block_dim = 1 : i64, hacc.entry, hacc.function_kind = #hacc.function_kind<DEVICE>} {
// Index constants. Several are duplicates and only %c4_0, %c116_1, %c2_2,
// %c196_3 and %c0 are referenced below; the expected output lists all of them,
// so none may be removed or merged without updating the expectations.
%c4 = arith.constant 4 : index
%c116 = arith.constant 116 : index
%c2 = arith.constant 2 : index
%c196 = arith.constant 196 : index
%c4_0 = arith.constant 4 : index
%c116_1 = arith.constant 116 : index
%c2_2 = arith.constant 2 : index
%c196_3 = arith.constant 196 : index
%c196_4 = arith.constant 196 : index
%c2_5 = arith.constant 2 : index
%c116_6 = arith.constant 116 : index
%c4_7 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
// View %arg0 as 4x232x196 (dims 1-2 and dims 3-4 merged).
%collapse_shape = memref.collapse_shape %arg0 [[0], [1, 2], [3, 4]] : memref<4x2x116x14x14xf32> into memref<4x232x196xf32>
// View %arg1 as 4x58x2x196: split 116 into 58x2, then merge the two 14s.
%expand_shape = memref.expand_shape %arg1 [[0], [1, 2], [3], [4]] output_shape [4, 58, 2, 14, 14] : memref<4x116x14x14xf32> into memref<4x58x2x14x14xf32>
%collapse_shape_8 = memref.collapse_shape %expand_shape [[0], [1], [2], [3, 4]] : memref<4x58x2x14x14xf32> into memref<4x58x2x196xf32>
// Take the first 116 of the 232 rows of the %arg0 view and reshape that slice
// to 4x58x2x196 as well (strided, since the slice is not contiguous).
%subview = memref.subview %collapse_shape[0, 0, 0] [4, 116, 196] [1, 1, 1] : memref<4x232x196xf32> to memref<4x116x196xf32, strided<[45472, 196, 1]>>
%expand_shape_9 = memref.expand_shape %subview [[0], [1, 2], [3, 4]] output_shape [4, 58, 2, 14, 14] : memref<4x116x196xf32, strided<[45472, 196, 1]>> into memref<4x58x2x14x14xf32, strided<[45472, 392, 196, 14, 1]>>
%collapse_shape_10 = memref.collapse_shape %expand_shape_9 [[0], [1], [2], [3, 4]] : memref<4x58x2x14x14xf32, strided<[45472, 392, 196, 14, 1]>> into memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
// Temporary buffer under test. Every access below indexes its leading
// dimension with %arg5; the expected output has this dimension reduced to 1.
%alloc = memref.alloc() {alignment = 64 : i64} : memref<4x116x2x196xf32>
// Two halves of the temporary along dim 1: rows [0, 58) and rows [58, 116).
// The second half starts at element offset 58 * 392 = 22736.
%subview_11 = memref.subview %alloc[0, 0, 0, 0] [4, 58, 2, 196] [1, 1, 1, 1] : memref<4x116x2x196xf32> to memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
%subview_12 = memref.subview %alloc[0, 58, 0, 0] [4, 58, 2, 196] [1, 1, 1, 1] : memref<4x116x2x196xf32> to memref<4x58x2x196xf32, strided<[45472, 392, 196, 1], offset: 22736>>
// View the output %arg2 as 4x2x116x196.
%collapse_shape_13 = memref.collapse_shape %arg2 [[0], [1], [2], [3, 4]] : memref<4x2x116x14x14xf32> into memref<4x2x116x196xf32>
%c0_14 = arith.constant 0 : index
%c1_15 = arith.constant 1 : index
// Loop nest. Each of the four iteration dimensions (extents 4, 116, 2, 196) is
// expressed the same way: two single-iteration loops (0 to 1 step 1) whose
// induction variables are summed into an index i, followed by a loop over
// [min(i * extent, extent), min((i + 1) * extent, extent)). Since i is always
// 0 here, each inner loop covers [0, extent).
scf.for %arg3 = %c0_14 to %c1_15 step %c1_15 {
scf.for %arg4 = %c0_14 to %c1_15 step %c1_15 {
// %c40 (and the %c40_* constants in the inner levels) is not used.
%c40 = arith.constant 40 : index
%0 = affine.apply #map(%arg3, %arg4)
%c4_16 = arith.constant 4 : index
// Lower bound: min(i * 4, 4).
%1 = affine.apply #map1(%0)[%c4_0]
%2 = affine.min #map2(%1, %c4_16)
%c1_17 = arith.constant 1 : index
// Upper bound: min((i + 1) * 4, 4).
%3 = arith.addi %0, %c1_17 : index
%4 = affine.apply #map1(%3)[%c4_0]
%5 = affine.min #map2(%4, %c4_16)
// %arg5 iterates the leading dimension (extent 4).
scf.for %arg5 = %2 to %5 step %c1_15 {
%c0_18 = arith.constant 0 : index
%c1_19 = arith.constant 1 : index
scf.for %arg6 = %c0_18 to %c1_19 step %c1_19 {
scf.for %arg7 = %c0_18 to %c1_19 step %c1_19 {
%c40_20 = arith.constant 40 : index
%6 = affine.apply #map(%arg6, %arg7)
%c116_21 = arith.constant 116 : index
%7 = affine.apply #map1(%6)[%c116_1]
%8 = affine.min #map2(%7, %c116_21)
%c1_22 = arith.constant 1 : index
%9 = arith.addi %6, %c1_22 : index
%10 = affine.apply #map1(%9)[%c116_1]
%11 = affine.min #map2(%10, %c116_21)
// %arg8 iterates the extent-116 dimension.
scf.for %arg8 = %8 to %11 step %c1_19 {
%c0_23 = arith.constant 0 : index
%c1_24 = arith.constant 1 : index
scf.for %arg9 = %c0_23 to %c1_24 step %c1_24 {
scf.for %arg10 = %c0_23 to %c1_24 step %c1_24 {
%c40_25 = arith.constant 40 : index
%12 = affine.apply #map(%arg9, %arg10)
%c2_26 = arith.constant 2 : index
%13 = affine.apply #map1(%12)[%c2_2]
%14 = affine.min #map2(%13, %c2_26)
%c1_27 = arith.constant 1 : index
%15 = arith.addi %12, %c1_27 : index
%16 = affine.apply #map1(%15)[%c2_2]
%17 = affine.min #map2(%16, %c2_26)
// %arg11 iterates the extent-2 dimension.
scf.for %arg11 = %14 to %17 step %c1_24 {
%c0_28 = arith.constant 0 : index
%c1_29 = arith.constant 1 : index
scf.for %arg12 = %c0_28 to %c1_29 step %c1_29 {
scf.for %arg13 = %c0_28 to %c1_29 step %c1_29 {
%c40_30 = arith.constant 40 : index
%18 = affine.apply #map(%arg12, %arg13)
%c196_31 = arith.constant 196 : index
%19 = affine.apply #map1(%18)[%c196_3]
%20 = affine.min #map2(%19, %c196_31)
%c1_32 = arith.constant 1 : index
%21 = arith.addi %18, %c1_32 : index
%22 = affine.apply #map1(%21)[%c196_3]
%23 = affine.min #map2(%22, %c196_31)
// %arg14 iterates the innermost extent-196 dimension.
scf.for %arg14 = %20 to %23 step %c1_29 {
// Guard: 57 - %arg8 >= 0, i.e. %arg8 <= 57, keeps the access inside the
// 58-row views.
%24 = affine.apply #map3(%arg8)
%25 = arith.cmpi sge, %24, %c0 {skip_vectorize} : index
scf.if %25 {
// Copy one element of the %arg0-derived view into the first half of the
// temporary.
%29 = memref.load %collapse_shape_10[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
memref.store %29, %subview_11[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32, strided<[45472, 392, 196, 1]>>
}
// Second guard: the same condition as the first (%arg8 <= 57), recomputed.
%26 = affine.apply #map3(%arg8)
%27 = arith.cmpi sge, %26, %c0 {skip_vectorize} : index
scf.if %27 {
// Copy one element of the %arg1-derived view into the second half of the
// temporary.
%29 = memref.load %collapse_shape_8[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32>
memref.store %29, %subview_12[%arg5, %arg8, %arg11, %arg14] : memref<4x58x2x196xf32, strided<[45472, 392, 196, 1], offset: 22736>>
}
// Read the temporary back and write it to the output view with dims 1 and 2
// swapped (%arg11 before %arg8).
%28 = memref.load %alloc[%arg5, %arg8, %arg11, %arg14] : memref<4x116x2x196xf32>
memref.store %28, %collapse_shape_13[%arg5, %arg11, %arg8, %arg14] : memref<4x2x116x196xf32>
}
}
}
}
}
}
}
}
}
}
}
// The outermost loop carries the unit attribute map_for_to_forall, which the
// expected output also retains.
} {map_for_to_forall}
return %arg2 : memref<4x2x116x14x14xf32>
}
}