import asc
from asc.runtime import config
def setup_function():
config.set_platform(config.Backend.Model, check=False)
def test_repeat_reduce_sum_kernel(mock_launcher_run):
@asc.jit
def repeat_reduce_sum_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=512)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=512)
asc.repeat_reduce_sum(z_local, x_local, repeat_time=4, mask=128,
dst_blk_stride=0, src_blk_stride=1, dst_rep_stride=8, src_rep_stride=8)
repeat_reduce_sum_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_pair_reduce_sum_kernel(mock_launcher_run):
@asc.jit
def pair_reduce_sum_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=512)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=512)
asc.pair_reduce_sum(z_local, x_local, repeat_time=2, mask=128,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.pair_reduce_sum(z_local, x_local, repeat_time=2, mask=mask,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
pair_reduce_sum_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_whole_reduce_sum_kernel(mock_launcher_run):
@asc.jit
def whole_reduce_sum_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=512)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=512)
asc.whole_reduce_sum(z_local, x_local, mask=128, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.whole_reduce_sum(z_local, x_local, mask=mask, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
whole_reduce_sum_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_whole_reduce_max_kernel(mock_launcher_run):
@asc.jit
def whole_reduce_max_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=512)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=512)
asc.whole_reduce_max(z_local, x_local, mask=128, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
asc.whole_reduce_max(z_local, x_local, mask=128, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8,
order=asc.ReduceOrder.ORDER_INDEX_VALUE)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.whole_reduce_max(z_local, x_local, mask=mask, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
asc.whole_reduce_max(z_local, x_local, mask=mask, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8,
order=asc.ReduceOrder.ORDER_INDEX_VALUE)
whole_reduce_max_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_whole_reduce_min_kernel(mock_launcher_run):
@asc.jit
def whole_reduce_min_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=512)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=512)
asc.whole_reduce_min(z_local, x_local, mask=128, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
asc.whole_reduce_min(z_local, x_local, mask=128, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8,
order=asc.ReduceOrder.ORDER_INDEX_VALUE)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.whole_reduce_min(z_local, x_local, mask=mask, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8)
asc.whole_reduce_min(z_local, x_local, mask=mask, repeat_time=4,
dst_rep_stride=1, src_blk_stride=1, src_rep_stride=8,
order=asc.ReduceOrder.ORDER_INDEX_VALUE)
whole_reduce_min_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_reduce_max_kernel(mock_launcher_run):
@asc.jit
def reduce_max_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=8320)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=8320)
shared_tmp = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECCALC, addr=0, tile_size=8320)
asc.reduce_max(z_local, x_local, shared_tmp_buffer=shared_tmp,
mask=128, repeat_time=128, src_rep_stride=65, cal_index=True)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.reduce_max(z_local, x_local, shared_tmp_buffer=shared_tmp,
mask=mask, repeat_time=65, src_rep_stride=8, cal_index=True)
asc.reduce_max(z_local, x_local, shared_tmp_buffer=shared_tmp, count=2048, cal_index=True)
reduce_max_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_reduce_min_kernel(mock_launcher_run):
@asc.jit
def reduce_min_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=8320)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=8320)
shared_tmp = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECCALC, addr=0, tile_size=8320)
asc.reduce_min(z_local, x_local, shared_tmp_buffer=shared_tmp,
mask=128, repeat_time=128, src_rep_stride=65, cal_index=True)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.reduce_min(z_local, x_local, shared_tmp_buffer=shared_tmp,
mask=mask, repeat_time=65, src_rep_stride=8, cal_index=True)
asc.reduce_min(z_local, x_local, shared_tmp_buffer=shared_tmp, count=2048, cal_index=True)
reduce_min_kernel[1]()
assert mock_launcher_run.call_count == 1
def test_reduce_sum_kernel(mock_launcher_run):
@asc.jit
def reduce_sum_kernel():
x_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECIN, addr=0, tile_size=8320)
z_local = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECOUT, addr=0, tile_size=8320)
shared_tmp = asc.LocalTensor(dtype=asc.float16, pos=asc.TPosition.VECCALC, addr=0, tile_size=8320)
asc.reduce_sum(z_local, x_local, shared_tmp_buffer=shared_tmp, mask=128, repeat_time=128, src_rep_stride=65)
uint64_max = 2**64 - 1
mask = [uint64_max, uint64_max]
asc.reduce_sum(z_local, x_local, shared_tmp_buffer=shared_tmp, mask=mask, repeat_time=65, src_rep_stride=8)
asc.reduce_sum(z_local, x_local, shared_tmp_buffer=shared_tmp, count=2048)
reduce_sum_kernel[1]()
assert mock_launcher_run.call_count == 1