al.scope 接口文档
1. 硬件背景
昇腾处理器包含多种类型的计算单元(例如,用于矩阵运算的 Cube Unit 和用于向量/标量运算的 Vector Unit)。al.scope 允许内核开发者显式地告诉 Triton 编译器,特定代码区域应该目标哪种硬件单元,从而实现更精细的性能调优和资源利用。
2. 接口说明
| Python with al.scope(core_mode: str): # 此代码块内的 Triton 语句(如 tl.load, tl.store, 算术运算等) # 将根据指定的 core_mode 进行编译和执行。 ... |
al.scope 是 triton.language.extra.ascend 模块中的一个上下文管理器(Context Manager),专为 Triton 内核中的代码块指定 昇腾硬件的执行模式。
参数
| 参数名 | 类型 | 必需 | 说明 | 可选值 (示例) |
| core_mode | str | 是 | 指定该作用域内代码将要使用的昇腾核心类型 | "vector", "cube" |
常用值说明
| 值 | 目标核心 | 用途/优化方向 |
| "vector" | Vector Unit (向量核心) | 适用于元素级操作 (Element-wise Operations),如加法 (+)、乘法 (*)、激活函数 (ReLU, Sigmoid)、数据加载 (tl.load) 和存储 (tl.store)。 |
| "cube" | Cube Unit (矩阵核心) | 适用于矩阵计算,特别是矩阵乘法 (Matrix Multiplication, GEMM) 和卷积操作。这通常与 tl.dot 等操作相关联。 |
| "SIMT" | Single instruction multiple thread | - |
| "SIMD" | Single instruction multiple data | - |
3. 约束说明
each kernel have 1 scope for cube and vector, inside them they run parallely and there are other syncing operations that declares the sync between both of the scope
-
Parallel Execution: Operations within cube and vector scopes execute in parallel
-
Single Scope per Type: Each kernel supports one cube scope and one vector scope (?)
-
Explicit Synchronization: Required for data dependencies between scopes using sync operations
4. 用例示例
| Python import os os.environ["TORCH_DEVICE_BACKEND_AUTOLOAD"] = "0" import pytest import triton import triton.language as tl import triton.language.extra.cann.extension as al from triton.compiler.compiler import ASTSource from triton.compiler.code_generator import ast_to_ttir from triton._C.libtriton import ir from triton._C.libtriton.ascend import ir as ascend_ir class Options: num_warps = 4 num_stages = 3 num_ctas = 1 cluster_dims = (1, 1, 1) enable_fp_fusion = True debug = False def compile_kernel(kernel, signature, constants): src = ASTSource(kernel, signature, constants) context = ir.context() ir.load_dialects(context) ascend_ir.load_dialects(context) module = ast_to_ttir(kernel, src, context, Options(), {}, {}) return str(module) @triton.jit def kernel_nested_scope(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr): i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) with al.scope(core_mode="vector"): with al.scope(core_mode="vector"): with al.scope(core_mode="cube"): x = tl.load(x_ptr + i, mask=i < n) y = tl.load(y_ptr + i, mask=i < n) result = x + y tl.store(out_ptr + i, result, mask=i < n) @triton.jit def kernel_scope_escape(x_ptr, out_ptr, n, BLOCK: tl.constexpr): i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) with al.scope(core_mode="vector"): x = tl.load(x_ptr + i, mask=i < n) a = x + 1.0 tl.store(out_ptr + i, a, mask=i < n) @triton.jit def kernel_scope_cube(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr): i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) with al.scope(core_mode="cube"): x = tl.load(x_ptr + i, mask=i < n) y = tl.load(y_ptr + i, mask=i < n) result = x + y tl.store(out_ptr + i, result, mask=i < n) @triton.jit def kernel_scope_vector(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr): i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) with al.scope(core_mode="vector"): x = tl.load(x_ptr + i, mask=i < n) y = tl.load(y_ptr + i, mask=i < n) result = x + y tl.store(out_ptr + i, result, mask=i < n) @triton.jit def kernel_scope_disable_auto_sync(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr): i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) with al.scope(core_mode="vector", disable_auto_sync=True): x = tl.load(x_ptr + i, mask=i < n) y = tl.load(y_ptr + i, mask=i < n) result = x + y tl.store(out_ptr + i, result, mask=i < n) if __name__ == "__main__": print("=" * 60) print("Test 1: Nested Scopes") print("=" * 60) mlir = compile_kernel( kernel_nested_scope, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256} ) print(f"✅ Generated MLIR ({len(mlir)} chars):\n") print(mlir) print("\n" + "=" * 60) print("Test 2: Scope Escape") print("=" * 60) mlir = compile_kernel(kernel_scope_escape, {"x_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256}) print(f"✅ Generated MLIR ({len(mlir)} chars):\n") print(mlir) print("\n" + "=" * 60) print("Test 3: Cube Core Mode") print("=" * 60) mlir = compile_kernel( kernel_scope_cube, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256} ) print(f"✅ Generated MLIR ({len(mlir)} chars):\n") print(mlir) print("\n" + "=" * 60) print("Test 4: Vector Core Mode") print("=" * 60) mlir = compile_kernel( kernel_scope_vector, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256} ) print(f"✅ Generated MLIR ({len(mlir)} chars):\n") print(mlir) print("\n" + "=" * 60) print("Test 5: Disable Auto Sync") print("=" * 60) mlir = compile_kernel( kernel_scope_disable_auto_sync, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256}, ) print(f"✅ Generated MLIR ({len(mlir)} chars):\n") print(mlir) |
5. 编译输出结果
| Plain Text ============================================================ Test 1: Nested Scopes ============================================================ ✅ Generated MLIR (4155 chars): #loc = loc("/home/linxin/triton-test/scope.py":34:0) module { tt.func public @kernel_nested_scope(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":34:0)) attributes {noinline = false} { %0 = tt.get_program_id x : i32 loc(#loc1) %c256_i32 = arith.constant 256 : i32 loc(#loc2) %c256_i32_0 = arith.constant 256 : i32 loc(#loc2) %1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2) %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3) %3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4) %4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4) %5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %6:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %7:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %8 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc8) %9 = arith.cmpi slt, %4, %8 : tensor<256xi32> loc(#loc8) %10 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc9) %11 = tt.addptr %10, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc9) %cst = arith.constant 0.000000e+00 : f32 loc(#loc10) %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc10) %12 = tt.load %11, %9, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc10) %13 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc11) %14 = arith.cmpi slt, %4, %13 : tensor<256xi32> loc(#loc11) %15 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc12) %16 = tt.addptr %15, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc12) %cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc13) %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc13) %17 = tt.load %16, %14, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc13) %18 = arith.addf %12, %17 : tensor<256xf32> loc(#loc14) %19 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc15) %20 = arith.cmpi slt, %4, %19 : tensor<256xi32> loc(#loc15) %21 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc16) %22 = tt.addptr %21, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc16) tt.store %22, %18, %20 : tensor<256x!tt.ptr<f32>> loc(#loc17) scope.return %12, %17, %18 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc17) } {hivm.tcore_type = #hivm.tcore_type<CUBE>, noinline} loc(#loc7) scope.return %7#0, %7#1, %7#2 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc7) } {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc6) scope.return %6#0, %6#1, %6#2 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc6) } {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5) tt.return loc(#loc18) } loc(#loc) } loc(#loc) #loc1 = loc("/home/linxin/triton-test/scope.py":35:22) #loc2 = loc("/home/linxin/triton-test/scope.py":35:27) #loc3 = loc("/home/linxin/triton-test/scope.py":35:48) #loc4 = loc("/home/linxin/triton-test/scope.py":35:35) #loc5 = loc("/home/linxin/triton-test/scope.py":36:9) #loc6 = loc("/home/linxin/triton-test/scope.py":37:13) #loc7 = loc("/home/linxin/triton-test/scope.py":38:17) #loc8 = loc("/home/linxin/triton-test/scope.py":39:48) #loc9 = loc("/home/linxin/triton-test/scope.py":39:36) #loc10 = loc("/home/linxin/triton-test/scope.py":39:28) #loc11 = loc("/home/linxin/triton-test/scope.py":40:48) #loc12 = loc("/home/linxin/triton-test/scope.py":40:36) #loc13 = loc("/home/linxin/triton-test/scope.py":40:28) #loc14 = loc("/home/linxin/triton-test/scope.py":41:29) #loc15 = loc("/home/linxin/triton-test/scope.py":42:55) #loc16 = loc("/home/linxin/triton-test/scope.py":42:35) #loc17 = loc("/home/linxin/triton-test/scope.py":42:38) #loc18 = loc("/home/linxin/triton-test/scope.py":36:4) ============================================================ Test 2: Scope Escape ============================================================ ✅ Generated MLIR (2777 chars): #loc = loc("/home/linxin/triton-test/scope.py":46:0) module { tt.func public @kernel_scope_escape(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":46:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":46:0), %arg2: i32 loc("/home/linxin/triton-test/scope.py":46:0)) attributes {noinline = false} { %0 = tt.get_program_id x : i32 loc(#loc1) %c256_i32 = arith.constant 256 : i32 loc(#loc2) %c256_i32_0 = arith.constant 256 : i32 loc(#loc2) %1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2) %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3) %3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4) %4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4) %5 = scope.scope : () -> tensor<256xf32> { %11 = tt.splat %arg2 : i32 -> tensor<256xi32> loc(#loc6) %12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc6) %13 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7) %14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7) %cst_3 = arith.constant 0.000000e+00 : f32 loc(#loc8) %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8) %15 = tt.load %14, %12, %cst_4 : tensor<256x!tt.ptr<f32>> loc(#loc8) scope.return %15 : tensor<256xf32> loc(#loc8) } {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5) %cst = arith.constant 1.000000e+00 : f32 loc(#loc9) %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc9) %cst_2 = arith.constant dense<1.000000e+00> : tensor<256xf32> loc(#loc9) %6 = arith.addf %5, %cst_2 : tensor<256xf32> loc(#loc9) %7 = tt.splat %arg2 : i32 -> tensor<256xi32> loc(#loc10) %8 = arith.cmpi slt, %4, %7 : tensor<256xi32> loc(#loc10) %9 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc11) %10 = tt.addptr %9, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc11) tt.store %10, %6, %8 : tensor<256x!tt.ptr<f32>> loc(#loc12) tt.return loc(#loc13) } loc(#loc) } loc(#loc) #loc1 = loc("/home/linxin/triton-test/scope.py":47:22) #loc2 = loc("/home/linxin/triton-test/scope.py":47:27) #loc3 = loc("/home/linxin/triton-test/scope.py":47:48) #loc4 = loc("/home/linxin/triton-test/scope.py":47:35) #loc5 = loc("/home/linxin/triton-test/scope.py":48:9) #loc6 = loc("/home/linxin/triton-test/scope.py":49:40) #loc7 = loc("/home/linxin/triton-test/scope.py":49:28) #loc8 = loc("/home/linxin/triton-test/scope.py":49:20) #loc9 = loc("/home/linxin/triton-test/scope.py":50:12) #loc10 = loc("/home/linxin/triton-test/scope.py":51:38) #loc11 = loc("/home/linxin/triton-test/scope.py":51:23) #loc12 = loc("/home/linxin/triton-test/scope.py":51:26) #loc13 = loc("/home/linxin/triton-test/scope.py":51:4) ============================================================ Test 3: Cube Core Mode ============================================================ ✅ Generated MLIR (3422 chars): #loc = loc("/home/linxin/triton-test/scope.py":55:0) module { tt.func public @kernel_scope_cube(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":55:0)) attributes {noinline = false} { %0 = tt.get_program_id x : i32 loc(#loc1) %c256_i32 = arith.constant 256 : i32 loc(#loc2) %c256_i32_0 = arith.constant 256 : i32 loc(#loc2) %1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2) %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3) %3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4) %4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4) %5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6) %7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6) %8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7) %9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7) %cst = arith.constant 0.000000e+00 : f32 loc(#loc8) %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8) %10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8) %11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9) %12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9) %13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10) %14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10) %cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11) %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11) %15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11) %16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12) %17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13) %18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13) %19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14) %20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14) tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15) scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15) } {hivm.tcore_type = #hivm.tcore_type<CUBE>, noinline} loc(#loc5) tt.return loc(#loc16) } loc(#loc) } loc(#loc) #loc1 = loc("/home/linxin/triton-test/scope.py":56:22) #loc2 = loc("/home/linxin/triton-test/scope.py":56:27) #loc3 = loc("/home/linxin/triton-test/scope.py":56:48) #loc4 = loc("/home/linxin/triton-test/scope.py":56:35) #loc5 = loc("/home/linxin/triton-test/scope.py":57:9) #loc6 = loc("/home/linxin/triton-test/scope.py":58:40) #loc7 = loc("/home/linxin/triton-test/scope.py":58:28) #loc8 = loc("/home/linxin/triton-test/scope.py":58:20) #loc9 = loc("/home/linxin/triton-test/scope.py":59:40) #loc10 = loc("/home/linxin/triton-test/scope.py":59:28) #loc11 = loc("/home/linxin/triton-test/scope.py":59:20) #loc12 = loc("/home/linxin/triton-test/scope.py":60:21) #loc13 = loc("/home/linxin/triton-test/scope.py":61:47) #loc14 = loc("/home/linxin/triton-test/scope.py":61:27) #loc15 = loc("/home/linxin/triton-test/scope.py":61:30) #loc16 = loc("/home/linxin/triton-test/scope.py":57:4) ============================================================ Test 4: Vector Core Mode ============================================================ ✅ Generated MLIR (3426 chars): #loc = loc("/home/linxin/triton-test/scope.py":65:0) module { tt.func public @kernel_scope_vector(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":65:0)) attributes {noinline = false} { %0 = tt.get_program_id x : i32 loc(#loc1) %c256_i32 = arith.constant 256 : i32 loc(#loc2) %c256_i32_0 = arith.constant 256 : i32 loc(#loc2) %1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2) %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3) %3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4) %4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4) %5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6) %7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6) %8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7) %9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7) %cst = arith.constant 0.000000e+00 : f32 loc(#loc8) %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8) %10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8) %11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9) %12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9) %13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10) %14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10) %cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11) %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11) %15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11) %16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12) %17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13) %18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13) %19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14) %20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14) tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15) scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15) } {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5) tt.return loc(#loc16) } loc(#loc) } loc(#loc) #loc1 = loc("/home/linxin/triton-test/scope.py":66:22) #loc2 = loc("/home/linxin/triton-test/scope.py":66:27) #loc3 = loc("/home/linxin/triton-test/scope.py":66:48) #loc4 = loc("/home/linxin/triton-test/scope.py":66:35) #loc5 = loc("/home/linxin/triton-test/scope.py":67:9) #loc6 = loc("/home/linxin/triton-test/scope.py":68:40) #loc7 = loc("/home/linxin/triton-test/scope.py":68:28) #loc8 = loc("/home/linxin/triton-test/scope.py":68:20) #loc9 = loc("/home/linxin/triton-test/scope.py":69:40) #loc10 = loc("/home/linxin/triton-test/scope.py":69:28) #loc11 = loc("/home/linxin/triton-test/scope.py":69:20) #loc12 = loc("/home/linxin/triton-test/scope.py":70:21) #loc13 = loc("/home/linxin/triton-test/scope.py":71:47) #loc14 = loc("/home/linxin/triton-test/scope.py":71:27) #loc15 = loc("/home/linxin/triton-test/scope.py":71:30) #loc16 = loc("/home/linxin/triton-test/scope.py":67:4) ============================================================ Test 5: Disable Auto Sync ============================================================ ✅ Generated MLIR (3468 chars): #loc = loc("/home/linxin/triton-test/scope.py":75:0) module { tt.func public @kernel_scope_disable_auto_sync(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":75:0)) attributes {noinline = false} { %0 = tt.get_program_id x : i32 loc(#loc1) %c256_i32 = arith.constant 256 : i32 loc(#loc2) %c256_i32_0 = arith.constant 256 : i32 loc(#loc2) %1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2) %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3) %3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4) %4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4) %5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) { %6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6) %7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6) %8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7) %9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7) %cst = arith.constant 0.000000e+00 : f32 loc(#loc8) %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8) %10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8) %11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9) %12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9) %13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10) %14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10) %cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11) %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11) %15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11) %16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12) %17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13) %18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13) %19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14) %20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14) tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15) scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15) } {hivm.disable_auto_sync = true, hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5) tt.return loc(#loc16) } loc(#loc) } loc(#loc) #loc1 = loc("/home/linxin/triton-test/scope.py":76:22) #loc2 = loc("/home/linxin/triton-test/scope.py":76:27) #loc3 = loc("/home/linxin/triton-test/scope.py":76:48) #loc4 = loc("/home/linxin/triton-test/scope.py":76:35) #loc5 = loc("/home/linxin/triton-test/scope.py":77:9) #loc6 = loc("/home/linxin/triton-test/scope.py":78:40) #loc7 = loc("/home/linxin/triton-test/scope.py":78:28) #loc8 = loc("/home/linxin/triton-test/scope.py":78:20) #loc9 = loc("/home/linxin/triton-test/scope.py":79:40) #loc10 = loc("/home/linxin/triton-test/scope.py":79:28) #loc11 = loc("/home/linxin/triton-test/scope.py":79:20) #loc12 = loc("/home/linxin/triton-test/scope.py":80:21) #loc13 = loc("/home/linxin/triton-test/scope.py":81:47) #loc14 = loc("/home/linxin/triton-test/scope.py":81:27) #loc15 = loc("/home/linxin/triton-test/scope.py":81:30) #loc16 = loc("/home/linxin/triton-test/scope.py":77:4) |