Gggppff24docs: add api extension documentation for al and bl

al.scope 接口文档

1. 硬件背景

昇腾处理器包含多种类型的计算单元（例如，用于矩阵运算的 Cube Unit 和用于向量/标量运算的 Vector Unit）。al.scope 允许内核开发者显式地告诉 Triton 编译器，特定代码区域应该目标哪种硬件单元，从而实现更精细的性能调优和资源利用。

2. 接口说明

Python
with al.scope(core_mode: str):
# 此代码块内的 Triton 语句（如 tl.load, tl.store, 算术运算等）
# 将根据指定的 core_mode 进行编译和执行。
...

al.scope 是 triton.language.extra.ascend 模块中的一个上下文管理器（Context Manager），专为 Triton 内核中的代码块指定昇腾硬件的执行模式。

参数

参数名	类型	必需	说明	可选值 (示例)
core_mode	str	是	指定该作用域内代码将要使用的昇腾核心类型	"vector", "cube"

常用值说明

值	目标核心	用途/优化方向
"vector"	Vector Unit (向量核心)	适用于元素级操作 (Element-wise Operations)，如加法 (+)、乘法 (*)、激活函数 (ReLU, Sigmoid)、数据加载 (tl.load) 和存储 (tl.store)。
"cube"	Cube Unit (矩阵核心)	适用于矩阵计算，特别是矩阵乘法 (Matrix Multiplication, GEMM) 和卷积操作。这通常与 tl.dot 等操作相关联。
"SIMT"	Single instruction multiple thread	-
"SIMD"	Single instruction multiple data	-

3. 约束说明

each kernel have 1 scope for cube and vector, inside them they run parallely and there are other syncing operations that declares the sync between both of the scope

Parallel Execution: Operations within cube and vector scopes execute in parallel
Single Scope per Type: Each kernel supports one cube scope and one vector scope (?)
Explicit Synchronization: Required for data dependencies between scopes using sync operations

4. 用例示例

Python
import os

os.environ["TORCH_DEVICE_BACKEND_AUTOLOAD"] = "0"

import pytest

import triton

import triton.language as tl

import triton.language.extra.cann.extension as al

from triton.compiler.compiler import ASTSource

from triton.compiler.code_generator import ast_to_ttir

from triton._C.libtriton import ir

from triton._C.libtriton.ascend import ir as ascend_ir

class Options:

num_warps = 4

num_stages = 3

num_ctas = 1

cluster_dims = (1, 1, 1)

enable_fp_fusion = True

debug = False

def compile_kernel(kernel, signature, constants):

src = ASTSource(kernel, signature, constants)

context = ir.context()

ir.load_dialects(context)

ascend_ir.load_dialects(context)

module = ast_to_ttir(kernel, src, context, Options(), {}, {})

return str(module)

@triton.jit

def kernel_nested_scope(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):

i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)

with al.scope(core_mode="vector"):

with al.scope(core_mode="vector"):

with al.scope(core_mode="cube"):

x = tl.load(x_ptr + i, mask=i < n)

y = tl.load(y_ptr + i, mask=i < n)

result = x + y

tl.store(out_ptr + i, result, mask=i < n)

@triton.jit

def kernel_scope_escape(x_ptr, out_ptr, n, BLOCK: tl.constexpr):

i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)

with al.scope(core_mode="vector"):

x = tl.load(x_ptr + i, mask=i < n)

a = x + 1.0

tl.store(out_ptr + i, a, mask=i < n)

@triton.jit

def kernel_scope_cube(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):

i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)

with al.scope(core_mode="cube"):

x = tl.load(x_ptr + i, mask=i < n)

y = tl.load(y_ptr + i, mask=i < n)

result = x + y

tl.store(out_ptr + i, result, mask=i < n)

@triton.jit

def kernel_scope_vector(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):

i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)

with al.scope(core_mode="vector"):

x = tl.load(x_ptr + i, mask=i < n)

y = tl.load(y_ptr + i, mask=i < n)

result = x + y

tl.store(out_ptr + i, result, mask=i < n)

@triton.jit

def kernel_scope_disable_auto_sync(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):

i = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)

with al.scope(core_mode="vector", disable_auto_sync=True):

x = tl.load(x_ptr + i, mask=i < n)

y = tl.load(y_ptr + i, mask=i < n)

result = x + y

tl.store(out_ptr + i, result, mask=i < n)

if __name__ == "__main__":

print("=" * 60)

print("Test 1: Nested Scopes")

print("=" * 60)

mlir = compile_kernel(

kernel_nested_scope, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256}

)

print(f"✅ Generated MLIR ({len(mlir)} chars):\n")

print(mlir)

print("\n" + "=" * 60)

print("Test 2: Scope Escape")

print("=" * 60)

mlir = compile_kernel(kernel_scope_escape, {"x_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256})

print(f"✅ Generated MLIR ({len(mlir)} chars):\n")

print(mlir)

print("\n" + "=" * 60)

print("Test 3: Cube Core Mode")

print("=" * 60)

mlir = compile_kernel(

kernel_scope_cube, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256}

)

print(f"✅ Generated MLIR ({len(mlir)} chars):\n")

print(mlir)

print("\n" + "=" * 60)

print("Test 4: Vector Core Mode")

print("=" * 60)

mlir = compile_kernel(

kernel_scope_vector, {"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"}, {"BLOCK": 256}

)

print(f"✅ Generated MLIR ({len(mlir)} chars):\n")

print(mlir)

print("\n" + "=" * 60)

print("Test 5: Disable Auto Sync")

print("=" * 60)

mlir = compile_kernel(

kernel_scope_disable_auto_sync,

{"x_ptr": "*fp32", "y_ptr": "*fp32", "out_ptr": "*fp32", "n": "i32"},

{"BLOCK": 256},

)

print(f"✅ Generated MLIR ({len(mlir)} chars):\n")

print(mlir)

5. 编译输出结果

Plain Text
============================================================

Test 1: Nested Scopes

============================================================

✅ Generated MLIR (4155 chars):

#loc = loc("/home/linxin/triton-test/scope.py":34:0)

module {

tt.func public @kernel_nested_scope(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":34:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":34:0)) attributes {noinline = false} {

%0 = tt.get_program_id x : i32 loc(#loc1)

%c256_i32 = arith.constant 256 : i32 loc(#loc2)

%c256_i32_0 = arith.constant 256 : i32 loc(#loc2)

%1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2)

%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3)

%3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4)

%4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4)

%5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%6:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%7:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%8 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc8)

%9 = arith.cmpi slt, %4, %8 : tensor<256xi32> loc(#loc8)

%10 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc9)

%11 = tt.addptr %10, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc9)

%cst = arith.constant 0.000000e+00 : f32 loc(#loc10)

%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc10)

%12 = tt.load %11, %9, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc10)

%13 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc11)

%14 = arith.cmpi slt, %4, %13 : tensor<256xi32> loc(#loc11)

%15 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc12)

%16 = tt.addptr %15, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc12)

%cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc13)

%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc13)

%17 = tt.load %16, %14, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc13)

%18 = arith.addf %12, %17 : tensor<256xf32> loc(#loc14)

%19 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc15)

%20 = arith.cmpi slt, %4, %19 : tensor<256xi32> loc(#loc15)

%21 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc16)

%22 = tt.addptr %21, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc16)

tt.store %22, %18, %20 : tensor<256x!tt.ptr<f32>> loc(#loc17)

scope.return %12, %17, %18 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc17)

} {hivm.tcore_type = #hivm.tcore_type<CUBE>, noinline} loc(#loc7)

scope.return %7#0, %7#1, %7#2 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc7)

} {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc6)

scope.return %6#0, %6#1, %6#2 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc6)

} {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5)

tt.return loc(#loc18)

} loc(#loc)

} loc(#loc)

#loc1 = loc("/home/linxin/triton-test/scope.py":35:22)

#loc2 = loc("/home/linxin/triton-test/scope.py":35:27)

#loc3 = loc("/home/linxin/triton-test/scope.py":35:48)

#loc4 = loc("/home/linxin/triton-test/scope.py":35:35)

#loc5 = loc("/home/linxin/triton-test/scope.py":36:9)

#loc6 = loc("/home/linxin/triton-test/scope.py":37:13)

#loc7 = loc("/home/linxin/triton-test/scope.py":38:17)

#loc8 = loc("/home/linxin/triton-test/scope.py":39:48)

#loc9 = loc("/home/linxin/triton-test/scope.py":39:36)

#loc10 = loc("/home/linxin/triton-test/scope.py":39:28)

#loc11 = loc("/home/linxin/triton-test/scope.py":40:48)

#loc12 = loc("/home/linxin/triton-test/scope.py":40:36)

#loc13 = loc("/home/linxin/triton-test/scope.py":40:28)

#loc14 = loc("/home/linxin/triton-test/scope.py":41:29)

#loc15 = loc("/home/linxin/triton-test/scope.py":42:55)

#loc16 = loc("/home/linxin/triton-test/scope.py":42:35)

#loc17 = loc("/home/linxin/triton-test/scope.py":42:38)

#loc18 = loc("/home/linxin/triton-test/scope.py":36:4)

============================================================

Test 2: Scope Escape

============================================================

✅ Generated MLIR (2777 chars):

#loc = loc("/home/linxin/triton-test/scope.py":46:0)

module {

tt.func public @kernel_scope_escape(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":46:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":46:0), %arg2: i32 loc("/home/linxin/triton-test/scope.py":46:0)) attributes {noinline = false} {

%0 = tt.get_program_id x : i32 loc(#loc1)

%c256_i32 = arith.constant 256 : i32 loc(#loc2)

%c256_i32_0 = arith.constant 256 : i32 loc(#loc2)

%1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2)

%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3)

%3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4)

%4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4)

%5 = scope.scope : () -> tensor<256xf32> {

%11 = tt.splat %arg2 : i32 -> tensor<256xi32> loc(#loc6)

%12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc6)

%13 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7)

%14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7)

%cst_3 = arith.constant 0.000000e+00 : f32 loc(#loc8)

%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8)

%15 = tt.load %14, %12, %cst_4 : tensor<256x!tt.ptr<f32>> loc(#loc8)

scope.return %15 : tensor<256xf32> loc(#loc8)

} {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5)

%cst = arith.constant 1.000000e+00 : f32 loc(#loc9)

%cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc9)

%cst_2 = arith.constant dense<1.000000e+00> : tensor<256xf32> loc(#loc9)

%6 = arith.addf %5, %cst_2 : tensor<256xf32> loc(#loc9)

%7 = tt.splat %arg2 : i32 -> tensor<256xi32> loc(#loc10)

%8 = arith.cmpi slt, %4, %7 : tensor<256xi32> loc(#loc10)

%9 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc11)

%10 = tt.addptr %9, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc11)

tt.store %10, %6, %8 : tensor<256x!tt.ptr<f32>> loc(#loc12)

tt.return loc(#loc13)

} loc(#loc)

} loc(#loc)

#loc1 = loc("/home/linxin/triton-test/scope.py":47:22)

#loc2 = loc("/home/linxin/triton-test/scope.py":47:27)

#loc3 = loc("/home/linxin/triton-test/scope.py":47:48)

#loc4 = loc("/home/linxin/triton-test/scope.py":47:35)

#loc5 = loc("/home/linxin/triton-test/scope.py":48:9)

#loc6 = loc("/home/linxin/triton-test/scope.py":49:40)

#loc7 = loc("/home/linxin/triton-test/scope.py":49:28)

#loc8 = loc("/home/linxin/triton-test/scope.py":49:20)

#loc9 = loc("/home/linxin/triton-test/scope.py":50:12)

#loc10 = loc("/home/linxin/triton-test/scope.py":51:38)

#loc11 = loc("/home/linxin/triton-test/scope.py":51:23)

#loc12 = loc("/home/linxin/triton-test/scope.py":51:26)

#loc13 = loc("/home/linxin/triton-test/scope.py":51:4)

============================================================

Test 3: Cube Core Mode

============================================================

✅ Generated MLIR (3422 chars):

#loc = loc("/home/linxin/triton-test/scope.py":55:0)

module {

tt.func public @kernel_scope_cube(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":55:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":55:0)) attributes {noinline = false} {

%0 = tt.get_program_id x : i32 loc(#loc1)

%c256_i32 = arith.constant 256 : i32 loc(#loc2)

%c256_i32_0 = arith.constant 256 : i32 loc(#loc2)

%1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2)

%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3)

%3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4)

%4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4)

%5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6)

%7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6)

%8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7)

%9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7)

%cst = arith.constant 0.000000e+00 : f32 loc(#loc8)

%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8)

%10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8)

%11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9)

%12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9)

%13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10)

%14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10)

%cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11)

%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11)

%15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11)

%16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12)

%17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13)

%18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13)

%19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14)

%20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14)

tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15)

scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15)

} {hivm.tcore_type = #hivm.tcore_type<CUBE>, noinline} loc(#loc5)

tt.return loc(#loc16)

} loc(#loc)

} loc(#loc)

#loc1 = loc("/home/linxin/triton-test/scope.py":56:22)

#loc2 = loc("/home/linxin/triton-test/scope.py":56:27)

#loc3 = loc("/home/linxin/triton-test/scope.py":56:48)

#loc4 = loc("/home/linxin/triton-test/scope.py":56:35)

#loc5 = loc("/home/linxin/triton-test/scope.py":57:9)

#loc6 = loc("/home/linxin/triton-test/scope.py":58:40)

#loc7 = loc("/home/linxin/triton-test/scope.py":58:28)

#loc8 = loc("/home/linxin/triton-test/scope.py":58:20)

#loc9 = loc("/home/linxin/triton-test/scope.py":59:40)

#loc10 = loc("/home/linxin/triton-test/scope.py":59:28)

#loc11 = loc("/home/linxin/triton-test/scope.py":59:20)

#loc12 = loc("/home/linxin/triton-test/scope.py":60:21)

#loc13 = loc("/home/linxin/triton-test/scope.py":61:47)

#loc14 = loc("/home/linxin/triton-test/scope.py":61:27)

#loc15 = loc("/home/linxin/triton-test/scope.py":61:30)

#loc16 = loc("/home/linxin/triton-test/scope.py":57:4)

============================================================

Test 4: Vector Core Mode

============================================================

✅ Generated MLIR (3426 chars):

#loc = loc("/home/linxin/triton-test/scope.py":65:0)

module {

tt.func public @kernel_scope_vector(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":65:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":65:0)) attributes {noinline = false} {

%0 = tt.get_program_id x : i32 loc(#loc1)

%c256_i32 = arith.constant 256 : i32 loc(#loc2)

%c256_i32_0 = arith.constant 256 : i32 loc(#loc2)

%1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2)

%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3)

%3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4)

%4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4)

%5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6)

%7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6)

%8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7)

%9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7)

%cst = arith.constant 0.000000e+00 : f32 loc(#loc8)

%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8)

%10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8)

%11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9)

%12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9)

%13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10)

%14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10)

%cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11)

%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11)

%15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11)

%16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12)

%17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13)

%18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13)

%19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14)

%20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14)

tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15)

scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15)

} {hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5)

tt.return loc(#loc16)

} loc(#loc)

} loc(#loc)

#loc1 = loc("/home/linxin/triton-test/scope.py":66:22)

#loc2 = loc("/home/linxin/triton-test/scope.py":66:27)

#loc3 = loc("/home/linxin/triton-test/scope.py":66:48)

#loc4 = loc("/home/linxin/triton-test/scope.py":66:35)

#loc5 = loc("/home/linxin/triton-test/scope.py":67:9)

#loc6 = loc("/home/linxin/triton-test/scope.py":68:40)

#loc7 = loc("/home/linxin/triton-test/scope.py":68:28)

#loc8 = loc("/home/linxin/triton-test/scope.py":68:20)

#loc9 = loc("/home/linxin/triton-test/scope.py":69:40)

#loc10 = loc("/home/linxin/triton-test/scope.py":69:28)

#loc11 = loc("/home/linxin/triton-test/scope.py":69:20)

#loc12 = loc("/home/linxin/triton-test/scope.py":70:21)

#loc13 = loc("/home/linxin/triton-test/scope.py":71:47)

#loc14 = loc("/home/linxin/triton-test/scope.py":71:27)

#loc15 = loc("/home/linxin/triton-test/scope.py":71:30)

#loc16 = loc("/home/linxin/triton-test/scope.py":67:4)

============================================================

Test 5: Disable Auto Sync

============================================================

✅ Generated MLIR (3468 chars):

#loc = loc("/home/linxin/triton-test/scope.py":75:0)

module {

tt.func public @kernel_scope_disable_auto_sync(%arg0: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg1: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg2: !tt.ptr<f32> loc("/home/linxin/triton-test/scope.py":75:0), %arg3: i32 loc("/home/linxin/triton-test/scope.py":75:0)) attributes {noinline = false} {

%0 = tt.get_program_id x : i32 loc(#loc1)

%c256_i32 = arith.constant 256 : i32 loc(#loc2)

%c256_i32_0 = arith.constant 256 : i32 loc(#loc2)

%1 = arith.muli %0, %c256_i32_0 : i32 loc(#loc2)

%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc3)

%3 = tt.splat %1 : i32 -> tensor<256xi32> loc(#loc4)

%4 = arith.addi %3, %2 : tensor<256xi32> loc(#loc4)

%5:3 = scope.scope : () -> (tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) {

%6 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc6)

%7 = arith.cmpi slt, %4, %6 : tensor<256xi32> loc(#loc6)

%8 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc7)

%9 = tt.addptr %8, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc7)

%cst = arith.constant 0.000000e+00 : f32 loc(#loc8)

%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc8)

%10 = tt.load %9, %7, %cst_1 : tensor<256x!tt.ptr<f32>> loc(#loc8)

%11 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc9)

%12 = arith.cmpi slt, %4, %11 : tensor<256xi32> loc(#loc9)

%13 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10)

%14 = tt.addptr %13, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10)

%cst_2 = arith.constant 0.000000e+00 : f32 loc(#loc11)

%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> loc(#loc11)

%15 = tt.load %14, %12, %cst_3 : tensor<256x!tt.ptr<f32>> loc(#loc11)

%16 = arith.addf %10, %15 : tensor<256xf32> loc(#loc12)

%17 = tt.splat %arg3 : i32 -> tensor<256xi32> loc(#loc13)

%18 = arith.cmpi slt, %4, %17 : tensor<256xi32> loc(#loc13)

%19 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc14)

%20 = tt.addptr %19, %4 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc14)

tt.store %20, %16, %18 : tensor<256x!tt.ptr<f32>> loc(#loc15)

scope.return %10, %15, %16 : tensor<256xf32>, tensor<256xf32>, tensor<256xf32> loc(#loc15)

} {hivm.disable_auto_sync = true, hivm.tcore_type = #hivm.tcore_type<VECTOR>, noinline} loc(#loc5)

tt.return loc(#loc16)

} loc(#loc)

} loc(#loc)

#loc1 = loc("/home/linxin/triton-test/scope.py":76:22)

#loc2 = loc("/home/linxin/triton-test/scope.py":76:27)

#loc3 = loc("/home/linxin/triton-test/scope.py":76:48)

#loc4 = loc("/home/linxin/triton-test/scope.py":76:35)

#loc5 = loc("/home/linxin/triton-test/scope.py":77:9)

#loc6 = loc("/home/linxin/triton-test/scope.py":78:40)

#loc7 = loc("/home/linxin/triton-test/scope.py":78:28)

#loc8 = loc("/home/linxin/triton-test/scope.py":78:20)

#loc9 = loc("/home/linxin/triton-test/scope.py":79:40)

#loc10 = loc("/home/linxin/triton-test/scope.py":79:28)

#loc11 = loc("/home/linxin/triton-test/scope.py":79:20)

#loc12 = loc("/home/linxin/triton-test/scope.py":80:21)

#loc13 = loc("/home/linxin/triton-test/scope.py":81:47)

#loc14 = loc("/home/linxin/triton-test/scope.py":81:27)

#loc15 = loc("/home/linxin/triton-test/scope.py":81:30)

#loc16 = loc("/home/linxin/triton-test/scope.py":77:4)