import sys
import os
import shutil
import logging
import tbe.common.platform as tbe_platform
from te import tik
# Configure the root logger at import time: INFO level, and each record shows
# the level name, timestamp, source file name, line number and the message.
logging.basicConfig(level=logging.INFO, format='%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s')
def ceil_block(value, tiling_dtype):
    """
    Return the number of Constant.BLOCK_BYTE_SIZE-byte blocks needed to hold
    `value` elements of type `tiling_dtype`; a partially filled block counts
    as a whole one.
    """
    bytes_per_elem = Constant.TYPE_LEN_DICT.get(tiling_dtype)
    value *= bytes_per_elem
    block_bytes = Constant.BLOCK_BYTE_SIZE
    return (value + block_bytes - 1) // block_bytes
def ceil_value(value_x, value_n):
    """
    Divide value_x by value_n and round the quotient up when the division
    is not exact.
    """
    rounded_up = value_x + value_n - 1
    return rounded_up // value_n
def mk_to_k1mk0(self, mk_input_tensor, k1mk0_tensor, dtype, k1, m, k0):
    """
    Change data format mk to k1mk0.

    For each of the k1 groups, the k0-wide chunk of all m rows is gathered
    into self.k1mk0_workspace (staging area starting at element 16 * 1024).
    The staged result is then copied to k1mk0_tensor with a single move.
    """
    inst = self.tik_instance
    elem_bytes = Constant.TYPE_LEN_DICT[dtype]
    staging_base = 16 * 1024
    with inst.for_range(0, k1) as k1_idx:
        staging_dst = self.k1mk0_workspace[(staging_base + k1_idx * m * k0):]
        chunk_src = mk_input_tensor[k1_idx * k0:]
        # m bursts of one k0-wide chunk each; the source skips the other k1 - 1 chunks of a row
        inst.data_move(staging_dst, chunk_src, 0, m,
                       k0 * elem_bytes // 32,
                       (k1 - 1) * k0 * elem_bytes // 32, 0)
    inst.data_move(k1mk0_tensor, self.k1mk0_workspace[staging_base], 0, 1,
                   k1 * m * k0 * elem_bytes // 32, 0, 0)
def kn_to_k1nk0(self, kn_input_tensor, k1nk0_tensor, dtype, k1, n, k0):
    """
    Change data format kn to k1nk0.

    Each of the k1 iterations loads k0 * n elements into UB, rearranges them
    with vec_trans_scatter and writes the (n, k0) result to k1nk0_tensor.
    """
    inst = self.tik_instance
    with inst.for_range(0, k1) as slab_idx:
        staged_out = inst.Tensor(dtype, (n, k0), tik.scope_ubuf, "k1nk0_ub")
        staged_in = inst.Tensor(dtype, (k0, n), tik.scope_ubuf, "src_ub")
        burst_len = k0 * n * Constant.TYPE_LEN_DICT[dtype] // 32
        slab_offset = slab_idx * k0 * n
        inst.data_move(staged_in, kn_input_tensor[slab_offset], 0, 1, burst_len, 0, 0)
        # vec_trans_scatter is given 16 destination and 16 source addresses
        dst_list = [staged_out[16 * row] for row in range(16)]
        src_list = [staged_in[n * row] for row in range(16)]
        inst.vec_trans_scatter(False, False, dst_list, src_list, n // k0, k0, 1)
        inst.data_move(k1nk0_tensor[slab_offset], staged_out, 0, 1, burst_len, 0, 0)
def n1mn0_to_mn(self, mn_output_tensor, n1mn0_tensor, dtype, n1, m, n0, src_ub):
    """
    Change data format n1mn0 to mn.

    Every n1 group is scattered into the caller-provided staging buffer
    src_ub (m bursts of n0 elements, leaving room for the other n1 - 1
    groups between bursts). The staging buffer is then copied to
    mn_output_tensor in one move.
    """
    inst = self.tik_instance
    elem_bytes = Constant.TYPE_LEN_DICT[dtype]
    with inst.for_range(0, n1) as n1_idx:
        inst.data_move(src_ub[n1_idx * n0:], n1mn0_tensor[n1_idx * m * n0:], 0, m,
                       n0 * elem_bytes // 32, 0,
                       (n1 - 1) * n0 * elem_bytes // 32)
    inst.data_move(mn_output_tensor, src_ub, 0, 1, m * n1 * n0 * elem_bytes // 32, 0, 0)
def vec_dup_dynamic(self, data_len, dst, dup_num):
    """
    Fill the first data_len elements of dst with dup_num.

    The work is emitted in three stages: batches of Constant.REPEAT_TIMES_MAX
    full-mask repeats, one call for the remaining full-mask repeats, and one
    call with a reduced mask for the trailing elements.
    """
    inst = self.tik_instance
    mask = Constant.TYPE_MASK_DICT.get(dst.dtype)
    max_repeat = Constant.REPEAT_TIMES_MAX
    repeat_times = inst.Scalar(dtype=self.scalar_dtype, name="repeat_times", init_value=0)
    offset = inst.Scalar(dtype=self.scalar_dtype, name="offset", init_value=0)
    mask_last = inst.Scalar(dtype=self.scalar_dtype, name="mask_last", init_value=0)

    # stage 1: batches of max_repeat full-mask repeats
    full_batches = data_len // mask // max_repeat
    with inst.for_range(0, full_batches) as batch_idx:
        repeat_times.set_as(max_repeat)
        offset.set_as(batch_idx * max_repeat * mask)
        inst.vec_dup(mask, dst[offset], dup_num, repeat_times, 8)

    # stage 2: the full-mask repeats that do not fill a whole batch
    rest_repeats = data_len // mask % max_repeat
    with inst.if_scope(rest_repeats > 0):
        repeat_times.set_as(rest_repeats)
        offset.set_as(full_batches * max_repeat * mask)
        inst.vec_dup(mask, dst[offset], dup_num, repeat_times, 8)

    # stage 3: trailing elements that do not fill a whole mask
    tail_elems = data_len % mask
    with inst.if_scope(tail_elems > 0):
        offset.set_as(mask * (full_batches * max_repeat + rest_repeats))
        mask_last.set_as(data_len % mask)
        repeat_times.set_as(1)
        inst.vec_dup(mask_last, dst[offset], dup_num, repeat_times, 8)
def vec_conv_dynamic(self, round_mode, data_len, dst, src, deqscale=None):
    """
    Convert the first data_len elements of src into dst with vec_conv.

    The mask is the smaller of the masks of the two dtypes, and the repeat
    strides (in blocks) are derived from it for each side. The work is
    emitted in batches of Constant.REPEAT_TIMES_MAX repeats, then the
    remaining full-mask repeats, then the trailing elements.
    """
    inst = self.tik_instance
    dst_mask = Constant.TYPE_MASK_DICT.get(dst.dtype)
    src_mask = Constant.TYPE_MASK_DICT.get(src.dtype)
    mask = dst_mask if dst_mask < src_mask else src_mask
    block_bytes = Constant.BLOCK_BYTE_SIZE
    max_repeat = Constant.REPEAT_TIMES_MAX
    dst_rep_stride = mask * Constant.TYPE_LEN_DICT.get(dst.dtype) // block_bytes
    src_rep_stride = mask * Constant.TYPE_LEN_DICT.get(src.dtype) // block_bytes
    repeat_times = inst.Scalar(dtype=self.scalar_dtype, name="repeat_times", init_value=0)
    offset = inst.Scalar(dtype=self.scalar_dtype, name="offset", init_value=0)
    mask_last = inst.Scalar(dtype=self.scalar_dtype, name="mask_last", init_value=0)

    # stage 1: batches of max_repeat full-mask repeats
    full_batches = data_len // mask // max_repeat
    with inst.for_range(0, full_batches) as batch_idx:
        repeat_times.set_as(max_repeat)
        offset.set_as(batch_idx * max_repeat * mask)
        inst.vec_conv(mask, round_mode, dst[offset], src[offset], repeat_times,
                      dst_rep_stride, src_rep_stride, deqscale=deqscale)

    # stage 2: the full-mask repeats that do not fill a whole batch
    rest_repeats = data_len // mask % max_repeat
    with inst.if_scope(rest_repeats > 0):
        repeat_times.set_as(rest_repeats)
        offset.set_as(full_batches * max_repeat * mask)
        inst.vec_conv(mask, round_mode, dst[offset], src[offset], repeat_times,
                      dst_rep_stride, src_rep_stride, deqscale=deqscale)

    # stage 3: trailing elements that do not fill a whole mask
    tail_elems = data_len % mask
    with inst.if_scope(tail_elems > 0):
        offset.set_as(mask * (full_batches * max_repeat + rest_repeats))
        mask_last.set_as(data_len % mask)
        repeat_times.set_as(1)
        inst.vec_conv(mask_last, round_mode, dst[offset], src[offset], repeat_times,
                      dst_rep_stride, src_rep_stride, deqscale=deqscale)
def vmul_vadd_vsub_dynamic(self, mode, data_len, dst, src0, src1):
    '''
    Element-wise binary operation over the first data_len elements:
    mode 1 -> vmul, mode 2 -> vadd, any other mode -> vsub.

    The work is emitted in batches of Constant.REPEAT_TIMES_MAX repeats,
    then the remaining full-mask repeats, then the trailing elements.
    '''
    inst = self.tik_instance
    mask = Constant.TYPE_MASK_DICT.get(dst.dtype)
    max_repeat = Constant.REPEAT_TIMES_MAX
    repeat_times = inst.Scalar(dtype=self.scalar_dtype, name="repeat_times", init_value=0)
    offset = inst.Scalar(dtype=self.scalar_dtype, name="offset", init_value=0)
    mask_last = inst.Scalar(dtype=self.scalar_dtype, name="mask_last", init_value=0)

    def _emit(active_mask):
        # Emit the instruction selected by `mode` at the current offset / repeat_times.
        with inst.if_scope(mode == 1):
            inst.vmul(active_mask, dst[offset], src0[offset], src1[offset],
                      repeat_times, 1, 1, 1, 8, 8, 8)
        with inst.elif_scope(mode == 2):
            inst.vadd(active_mask, dst[offset], src0[offset], src1[offset],
                      repeat_times, 1, 1, 1, 8, 8, 8)
        with inst.else_scope():
            inst.vsub(active_mask, dst[offset], src0[offset], src1[offset],
                      repeat_times, 1, 1, 1, 8, 8, 8)

    # stage 1: batches of max_repeat full-mask repeats
    full_batches = data_len // mask // max_repeat
    with inst.for_range(0, full_batches) as batch_idx:
        repeat_times.set_as(max_repeat)
        offset.set_as(batch_idx * max_repeat * mask)
        _emit(mask)

    # stage 2: the full-mask repeats that do not fill a whole batch
    rest_repeats = data_len // mask % max_repeat
    with inst.if_scope(rest_repeats > 0):
        repeat_times.set_as(rest_repeats)
        offset.set_as(full_batches * max_repeat * mask)
        _emit(mask)

    # stage 3: trailing elements that do not fill a whole mask
    tail_elems = data_len % mask
    with inst.if_scope(tail_elems > 0):
        offset.set_as(mask * (full_batches * max_repeat + rest_repeats))
        mask_last.set_as(data_len % mask)
        repeat_times.set_as(1)
        _emit(mask_last)
def vmuls_dynamic(self, data_len, dst, src, mul_num):
    """
    Multiply the first data_len elements of src by the scalar mul_num and
    store the result in dst.

    The work is emitted in batches of Constant.REPEAT_TIMES_MAX repeats,
    then the remaining full-mask repeats, then the trailing elements.
    """
    inst = self.tik_instance
    mask = Constant.TYPE_MASK_DICT.get(dst.dtype)
    max_repeat = Constant.REPEAT_TIMES_MAX
    repeat_times = inst.Scalar(dtype=self.scalar_dtype, name="repeat_times", init_value=0)
    offset = inst.Scalar(dtype=self.scalar_dtype, name="offset", init_value=0)
    mask_last = inst.Scalar(dtype=self.scalar_dtype, name="mask_last", init_value=0)

    # stage 1: batches of max_repeat full-mask repeats
    full_batches = data_len // mask // max_repeat
    with inst.for_range(0, full_batches) as batch_idx:
        repeat_times.set_as(max_repeat)
        offset.set_as(batch_idx * max_repeat * mask)
        inst.vmuls(mask, dst[offset], src[offset], mul_num, repeat_times, 1, 1, 8, 8)

    # stage 2: the full-mask repeats that do not fill a whole batch
    rest_repeats = data_len // mask % max_repeat
    with inst.if_scope(rest_repeats > 0):
        repeat_times.set_as(rest_repeats)
        offset.set_as(full_batches * max_repeat * mask)
        inst.vmuls(mask, dst[offset], src[offset], mul_num, repeat_times, 1, 1, 8, 8)

    # stage 3: trailing elements that do not fill a whole mask
    tail_elems = data_len % mask
    with inst.if_scope(tail_elems > 0):
        offset.set_as(mask * (full_batches * max_repeat + rest_repeats))
        mask_last.set_as(data_len % mask)
        repeat_times.set_as(1)
        inst.vmuls(mask_last, dst[offset], src[offset], mul_num, repeat_times, 1, 1, 8, 8)
class Constant:
    """
    Namespace of constants shared by the helpers and the kernel class:
    dtype names, per-dtype size tables, instruction limits and tile / matmul
    dimensions.
    """
    # length of the tiling argument tensors
    TILING_ARG_NUM = 24

    # dtype name strings
    INT32 = "int32"
    UINT8 = "uint8"
    UINT16 = "uint16"
    INT16 = "int16"
    FLOAT16 = "float16"
    FLOAT32 = "float32"
    FLOAT64 = "float64"

    # bytes per block, and per-dtype lookup tables
    BLOCK_BYTE_SIZE = 32
    # bytes per element
    TYPE_LEN_DICT = {"int8": 1, "uint8": 1, "uint16": 2, "float16": 2, "float32": 4, "int32": 4}
    # elements per block
    TYPE_NUM_EACH_BLOCK = {"int8": 32, "uint8": 32, "float16": 16, "float32": 8, "int32": 8}
    # full vector mask used by the *_dynamic helpers
    TYPE_MASK_DICT = {"int8": 256, "uint8": 256, "uint16": 128, "float16": 128, "int32": 64, "float32": 64}

    PARAM_NUM_EACH_BOX = 5
    POINT_NUM_EACH_BOX = 5

    # instruction limits
    BURST_MAX = 65535
    NBURST_MAX = 4095
    REPEAT_TIMES_MAX = 255

    UB_30K_SIZE = 30 * 1024

    # output tile: TILEW x TILEH pixels, TILEM pixels in total
    TILEW = 32
    TILEH = 32
    TILEM = TILEW * TILEH
    BUFFERLIMIT = 39 * TILEM

    # matmul dimensions: M0/N0/K0 are the inner sizes, M/K/N the sizes rounded up to them
    M0 = 16
    N0 = 16
    K0 = 16
    K1 = ceil_value(3, K0)
    N1 = ceil_value(TILEM, N0)
    M = ceil_value(2, M0) * M0
    K = K1 * K0
    N = N1 * N0
    K1NK0_SHAPE = (K1, N, K0)
    N1MN0_SHAPE = (N1, M, N0)
    N1MN0_NTHREAD = N1 * M * N0
    DST_COORD_SHAPE = (M, N)
class WarpPerspectiveUint8():
def __init__(self):
    """
    Select the Ascend310P3 AiCore target, create the TIK instance, record
    the dtypes used by the kernel and read the tiling arguments that define
    the input / output image shapes.
    """
    tbe_platform.set_current_compile_soc_info("Ascend310P3", core_type="AiCore")
    chip_profile = tik.Dprofile()
    self.tik_instance = tik.Tik(chip_profile, disable_debug=False)
    self.core_num = chip_profile.get_aicore_num()
    self.ub_size = chip_profile.get_unified_buffer_size()
    self.kernel_name = "warp_perspective_uint8"

    # dtypes used throughout the kernel
    self.dtype = Constant.FLOAT32
    self.scalar_dtype = Constant.INT32
    self.tiling_dtype = Constant.FLOAT32
    self.image_dtype = Constant.UINT8
    self.l1_in_type = Constant.FLOAT16
    self.loc_out_type = Constant.FLOAT32
    self.dtype_bytes_size = Constant.TYPE_LEN_DICT.get(self.dtype)
    self.data_each_block = Constant.BLOCK_BYTE_SIZE // self.dtype_bytes_size

    self.tiling_data = self.tik_instance.InputScalar(dtype=Constant.INT32, name="tiling_data")
    # the scalars must exist before get_tiling_args() fills them
    self.init_scalar()
    self.get_tiling_args()

    batch, channels = self.number, self.channel
    self.input_image_shape = (batch, self.input_height, self.input_width, channels)
    self.output_image_shape = (batch, self.output_height, self.output_width, channels)
    self.bilinear_shape = (1, Constant.TILEM)
    self.bilinear_nthread = Constant.TILEM
def init_scalar(self):
    """
    Create every TIK Scalar attribute used by the kernel.

    The scalars are created in the order listed below; each group shares one
    dtype (self.scalar_dtype, self.dtype or float16).
    """
    make_scalar = self.tik_instance.Scalar
    int_type = self.scalar_dtype
    float_type = self.dtype
    scalar_groups = (
        (int_type, (
            "output_height", "output_width", "flags", "border_mode",
            "number", "input_height", "input_width", "channel",
            "loop_w", "loop_h", "loop_total", "loop_w_idx", "loop_h_idx",
            "cur_w", "cur_wup_32", "cur_h", "w_start", "h_start", "coord_offset",
        )),
        (Constant.FLOAT16, ("temp",)),
        (int_type, ("load_input_or_not",)),
        (float_type, ("x_min", "x_max", "y_min", "y_max")),
        (int_type, (
            "x_int32_lft_top_idx", "x_int32_lft_bot_idx",
            "x_int32_rgt_top_idx", "x_int32_rgt_bot_idx",
            "y_int32_lft_top_idx", "y_int32_lft_bot_idx",
            "y_int32_rgt_top_idx", "y_int32_rgt_bot_idx",
        )),
        (float_type, (
            "src0", "src1", "input_width_float32", "input_height_float32",
            "input_width_sub_one_float32",
        )),
        (int_type, ("htmp", "wtmp", "wtmp_32")),
    )
    for scalar_type, attr_names in scalar_groups:
        for attr_name in attr_names:
            setattr(self, attr_name, make_scalar(scalar_type))
def get_tiling_args(self):
    """
    Load the tiling tensor from GM and unpack it into the kernel scalars.

    The float32 tiling data is also converted to float16 and, from float16,
    to int32. Heights and widths are rounded directly from their float32
    entries; the other integers are taken from the int32 copy. Entries 13-16
    become the float16 border_value tensor.
    """
    inst = self.tik_instance
    arg_shape = (Constant.TILING_ARG_NUM,)
    self.tiling_gm = inst.Tensor(self.tiling_dtype, arg_shape, name="tiling_gm", scope=tik.scope_gm)
    self.tiling_float32_ub = inst.Tensor(self.dtype, arg_shape,
                                         name="tiling_float32_ub", scope=tik.scope_ubuf)
    self.tiling_float16_ub = inst.Tensor(Constant.FLOAT16, arg_shape,
                                         name="tiling_float16_ub", scope=tik.scope_ubuf)
    self.tiling_int32_ub = inst.Tensor(self.scalar_dtype, arg_shape,
                                       name="tiling_int32_ub", scope=tik.scope_ubuf)

    inst.data_move(self.tiling_float32_ub, self.tiling_gm, 0, 1,
                   ceil_block(Constant.TILING_ARG_NUM, self.tiling_dtype), 0, 0)
    vec_conv_dynamic(self, "none", Constant.TILING_ARG_NUM, self.tiling_float16_ub, self.tiling_float32_ub)
    vec_conv_dynamic(self, "round", Constant.TILING_ARG_NUM, self.tiling_int32_ub, self.tiling_float16_ub)

    def _load_rounded(target, slot):
        # Read float32 tiling entry `slot` and round it into the integer scalar `target`.
        as_float = inst.Scalar(dtype=self.tiling_dtype, init_value=self.tiling_float32_ub[slot])
        inst.scalar_conv("round", target, as_float)

    _load_rounded(self.output_height, 9)
    _load_rounded(self.output_width, 10)
    self.flags.set_as(self.tiling_int32_ub[11])
    self.border_mode.set_as(self.tiling_int32_ub[12])

    self.border_value = inst.Tensor(Constant.FLOAT16, (4,), name="border_value", scope=tik.scope_ubuf)
    for lane in range(4):
        self.border_value[lane].set_as(self.tiling_float16_ub[13 + lane])

    self.number.set_as(self.tiling_int32_ub[17])
    _load_rounded(self.input_height, 18)
    _load_rounded(self.input_width, 19)
    self.channel.set_as(self.tiling_int32_ub[20])
def warp_perspective_compute(self):
    """
    Declare the input / output image tensors in GM, emit the kernel body and
    build it with BuildCCE.

    Returns the TIK instance that holds the built kernel.
    """
    inst = self.tik_instance
    self.input_image_gm = inst.Tensor(self.image_dtype, self.input_image_shape,
                                      name="input_image_gm", scope=tik.scope_gm)
    self.output_image_gm = inst.Tensor(self.image_dtype, self.output_image_shape,
                                       name="output_image_gm", scope=tik.scope_gm)
    kernel_inputs = [self.input_image_gm, self.tiling_gm]
    kernel_outputs = [self.output_image_gm]
    self.warp_perspective_compute_tiling()
    inst.BuildCCE(kernel_name=self.kernel_name, inputs=kernel_inputs, outputs=kernel_outputs,
                  flowtable=[self.tiling_data, ])
    return inst
def warp_perspective_compute_tiling(self):
    """
    Prepare the two 16x16 float16 matrices in L1 (a constant coefficient
    matrix and the 3x3 transform taken from tiling entries 0-8) and
    distribute the output tiles over the AI cores.
    """
    inst = self.tik_instance
    self.loop_w.set_as((self.output_width + Constant.TILEW - 1) / Constant.TILEW)
    self.loop_h.set_as((self.output_height + Constant.TILEH - 1) / Constant.TILEH)
    self.loop_total.set_as(self.loop_w * self.loop_h)

    self.x = inst.Tensor(Constant.FLOAT16, (16, 16), name="x", scope=tik.scope_ubuf)

    # constant coefficient matrix: zero the buffer, then set the non-zero entries
    inst.vec_dup(128, self.x, 0, ceil_value(16 * 16, 128), 8)
    coefficients = ((0, 1), (1, -1), (2, -1), (3, 1), (17, 1), (19, -1), (34, 1), (35, -1), (51, 1))
    for position, coeff in coefficients:
        self.x[position].set_as(coeff)
    self.mat_k_l1 = inst.Tensor(self.l1_in_type, (1, 16, 16), name='mat_k_l1', scope=tik.scope_cbuf)
    inst.data_move(self.mat_k_l1, self.x, 0, 1, ceil_block(256, Constant.FLOAT16), 0, 0)

    # transform matrix: tiling entries 0..8 go to positions 16 * row + col of the zeroed buffer
    inst.vec_dup(128, self.x, 0, ceil_value(16 * 16, 128), 8)
    for row in range(3):
        for col in range(3):
            self.x[16 * row + col] = self.tiling_float16_ub[3 * row + col]
    self.trans_matrix_l1 = inst.Tensor(self.l1_in_type, (1, 16, 16), name="trans_matrix_l1",
                                       scope=tik.scope_cbuf)
    inst.data_move(self.trans_matrix_l1, self.x, 0, 1, ceil_block(256, Constant.FLOAT16), 0, 0)

    # every core takes per_core_task_num tiles; the first core_task_tail cores take one more
    total_tasks = self.loop_total
    per_core_task_num = total_tasks // self.core_num
    core_task_tail = total_tasks % self.core_num
    with inst.for_range(0, self.core_num, block_num=self.core_num) as core_idx:
        with inst.for_range(0, per_core_task_num, name='j') as j:
            self.compute_per_core(core_idx * per_core_task_num + j)
        with inst.if_scope(core_idx < core_task_tail):
            self.compute_per_core(self.core_num * per_core_task_num + core_idx)
def compute_per_core(self, loopi):
    """
    Process output tile number loopi: derive the tile's position and size,
    then run the coordinate transform and the bilinear sampling for every
    image in the batch.
    """
    inst = self.tik_instance
    tile_w = Constant.TILEW
    tile_h = Constant.TILEH

    self.loop_w_idx.set_as(loopi % self.loop_w)
    self.loop_h_idx.set_as(loopi / self.loop_w)

    # width of this tile: the last tile column only covers the remaining pixels
    with inst.if_scope(self.loop_w_idx == self.loop_w - 1):
        self.cur_w.set_as(self.output_width - self.loop_w_idx * tile_w)
    with inst.else_scope():
        self.cur_w.set_as(tile_w)
    self.cur_wup_32.set_as(tile_w)

    # height of this tile: the last tile row only covers the remaining pixels
    with inst.if_scope(self.loop_h_idx == self.loop_h - 1):
        self.cur_h.set_as(self.output_height - self.loop_h_idx * tile_h)
    with inst.else_scope():
        self.cur_h.set_as(tile_h)

    # a partial-width tile starts tile_w pixels before the right edge of the output
    with inst.if_scope(self.cur_w == self.cur_wup_32):
        self.w_start.set_as(self.loop_w_idx * tile_w)
    with inst.else_scope():
        self.w_start.set_as(self.output_width - tile_w)
    self.h_start.set_as(self.loop_h_idx * tile_h)
    self.coord_offset.set_as(self.h_start * self.output_width + self.w_start)

    with inst.for_range(0, self.number) as n:
        self.coord_trans()
        self.get_image_bilinear(n)
def coord_trans(self):
    """
    Fill src_coord_ub with the coordinates of the current tile and run the
    coordinate transform matmul.

    Plane 0 holds the x coordinates, plane 1 the y coordinates, plane 2 is
    set to 1 and the remaining 13 planes are zeroed (each plane is
    Constant.TILEM elements).
    """
    inst = self.tik_instance
    plane = Constant.TILEM
    self.src_coord_ub = inst.Tensor(self.l1_in_type, (16, plane), name="src_coord_ub",
                                    scope=tik.scope_ubuf)

    # build one row of x coordinates in self.x
    with inst.for_range(0, self.cur_wup_32) as col:
        self.temp.set_as(col + self.w_start)
        self.x[col].set_as(self.temp)

    # replicate the x row for every tile row and fill the matching y row
    with inst.for_range(0, self.cur_h) as row:
        row_base = row * self.cur_wup_32
        inst.data_move(self.src_coord_ub[row_base], self.x, 0, 1,
                       ceil_block(self.cur_wup_32, self.l1_in_type), 0, 0)
        inst.vec_dup(self.cur_wup_32, self.src_coord_ub[plane + row_base],
                     row + self.h_start, 1, 8)

    inst.vec_dup(128, self.src_coord_ub[2 * plane], 1, ceil_value(plane, 128), 8)
    vec_dup_dynamic(self, (16 - 3) * plane, self.src_coord_ub[3 * plane], 0)
    self.coord_trans_matmul()
def coord_trans_matmul(self):
    """
    Multiply the transform matrix by the tile coordinates and produce the
    source coordinates x_fp32_ub / y_fp32_ub.

    Steps: coordinates UB -> L1 in k1nk0 layout, matmul, result -> UB,
    n1mn0 -> mn layout, then planes 0 and 1 of the result are divided by
    plane 2.
    """
    inst = self.tik_instance
    in_dtype = self.l1_in_type
    out_dtype = self.loc_out_type
    plane = Constant.TILEM

    self.src_coord_l1 = inst.Tensor(in_dtype, Constant.K1NK0_SHAPE, name='src_coord_l1',
                                    scope=tik.scope_cbuf)
    kn_to_k1nk0(self, self.src_coord_ub, self.src_coord_l1, in_dtype, Constant.K1, Constant.N, Constant.K0)

    self.dst_coord_loc = inst.Tensor(out_dtype, Constant.N1MN0_SHAPE, name='dst_coord_loc',
                                     scope=tik.scope_cbuf_out)
    inst.matmul(self.dst_coord_loc, self.trans_matrix_l1, self.src_coord_l1,
                Constant.M, Constant.K, Constant.N)

    self.dst_coord_n1mn0_ub = inst.Tensor(out_dtype, Constant.N1MN0_SHAPE,
                                          name='dst_coord_n1mn0_ub', scope=tik.scope_ubuf)
    inst.data_move(self.dst_coord_n1mn0_ub, self.dst_coord_loc, 0, 1,
                   ceil_value(Constant.N1MN0_NTHREAD, 256), 0, 0)

    self.dst_coord_ub = inst.Tensor(out_dtype, Constant.DST_COORD_SHAPE, name='dst_coord_ub',
                                    scope=tik.scope_ubuf)
    staging_len = Constant.BUFFERLIMIT * 2 // Constant.TYPE_LEN_DICT.get(out_dtype)
    self.src_ub = inst.Tensor(out_dtype, (staging_len,), name="src_ub", scope=tik.scope_ubuf)
    n1mn0_to_mn(self, self.dst_coord_ub, self.dst_coord_n1mn0_ub, out_dtype, Constant.N1, Constant.M,
                Constant.N0, self.src_ub)

    self.x_fp32_ub = inst.Tensor(self.dtype, self.bilinear_shape, name='x_fp32_ub', scope=tik.scope_ubuf)
    self.y_fp32_ub = inst.Tensor(self.dtype, self.bilinear_shape, name='y_fp32_ub', scope=tik.scope_ubuf)
    div_repeats = ceil_value(plane, 64)
    # x = plane 0 / plane 2, y = plane 1 / plane 2
    inst.vdiv(64, self.x_fp32_ub, self.dst_coord_ub, self.dst_coord_ub[2 * plane],
              div_repeats, 1, 1, 1, 8, 8, 8)
    inst.vdiv(64, self.y_fp32_ub, self.dst_coord_ub[plane], self.dst_coord_ub[2 * plane],
              div_repeats, 1, 1, 1, 8, 8, 8)
def get_image_bilinear(self, n):
# Compute per-pixel interpolation terms for the current tile from
# x_fp32_ub / y_fp32_ub, then hand over to move_input_image(n).
# n is the batch index and is only forwarded to move_input_image.
# Several buffers created earlier (dst_coord_n1mn0_ub, dst_coord_ub,
# src_coord_ub, src_ub) are reused here through reshape /
# reinterpret_cast_to, so the order of the statements matters.
tik_instance = self.tik_instance
self.x_int32_ub = tik_instance.Tensor(Constant.INT32, self.bilinear_shape, name='x_int32_ub',
scope=tik.scope_ubuf)
self.y_int32_ub = tik_instance.Tensor(Constant.INT32, self.bilinear_shape, name='y_int32_ub',
scope=tik.scope_ubuf)
# Integer parts: floor(x), floor(y).
vec_conv_dynamic(self, 'floor', self.bilinear_nthread, self.x_int32_ub, self.x_fp32_ub)
vec_conv_dynamic(self, 'floor', self.bilinear_nthread, self.y_int32_ub, self.y_fp32_ub)
# Convert the integer parts back to float into sections 0 and 1 of dst_coord_n1mn0_ub
# (one section = bilinear_nthread elements).
vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.dst_coord_n1mn0_ub, self.x_int32_ub)
vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.dst_coord_n1mn0_ub[self.bilinear_nthread],
self.y_int32_ub)
# Mode 3 is vsub: section 2 = x - floor(x), section 3 = y - floor(y).
vmul_vadd_vsub_dynamic(self, 3, self.bilinear_nthread,
self.dst_coord_n1mn0_ub[2 * self.bilinear_nthread],
self.x_fp32_ub,
self.dst_coord_n1mn0_ub)
vmul_vadd_vsub_dynamic(self, 3, self.bilinear_nthread,
self.dst_coord_n1mn0_ub[3 * self.bilinear_nthread],
self.y_fp32_ub,
self.dst_coord_n1mn0_ub[self.bilinear_nthread])
# Mode 1 is vmul: section 4 = product of the two fractional parts.
vmul_vadd_vsub_dynamic(self, 1, self.bilinear_nthread,
self.dst_coord_n1mn0_ub[4 * self.bilinear_nthread],
self.dst_coord_n1mn0_ub[2 * self.bilinear_nthread],
self.dst_coord_n1mn0_ub[3 * self.bilinear_nthread])
# Build the matmul input in src_coord_ub: section 0 = 1, sections 1..3 = the two
# fractional parts and their product, remaining 12 sections = 0.
tik_instance.vec_dup(128, self.src_coord_ub, 1, ceil_value(self.bilinear_nthread, 128), 8)
vec_conv_dynamic(self, 'none', Constant.TILEM,
self.src_coord_ub[self.bilinear_nthread],
self.dst_coord_n1mn0_ub[2 * self.bilinear_nthread])
vec_conv_dynamic(self, 'none', Constant.TILEM,
self.src_coord_ub[2 * self.bilinear_nthread],
self.dst_coord_n1mn0_ub[3 * self.bilinear_nthread])
vec_conv_dynamic(self, 'none', Constant.TILEM,
self.src_coord_ub[3 * self.bilinear_nthread],
self.dst_coord_n1mn0_ub[4 * self.bilinear_nthread])
vec_dup_dynamic(self, 12 * self.bilinear_nthread, self.src_coord_ub[4 * self.bilinear_nthread], 0)
# Reuse dst_coord_n1mn0_ub (viewed with the L1 input dtype) as the workspace that
# mk_to_k1mk0 reads through self.k1mk0_workspace.
self.k1mk0_workspace = self.dst_coord_n1mn0_ub.reinterpret_cast_to(self.l1_in_type)
mk_to_k1mk0(self, self.src_coord_ub, self.k1mk0_workspace, self.l1_in_type, 64, 16, Constant.K0)
# vec_trans writes the transposed workspace back into src_coord_ub, viewed as (64, 16, K0).
self.k1mk0_workspace_trans = self.src_coord_ub.reshape((64, 16, Constant.K0))
tik_instance.vec_trans(self.k1mk0_workspace_trans, self.k1mk0_workspace, 64, 1, 1)
# Move the prepared operand to L1 and multiply it by mat_k_l1 (M=1024, K=16, N=16).
self.n_l1 = tik_instance.Tensor(self.l1_in_type, (64, 16, Constant.K0), name='n_l1', scope=tik.scope_cbuf)
tik_instance.data_move(self.n_l1, self.k1mk0_workspace_trans, 0, 1, ceil_block(1024 * 16, self.l1_in_type), 0,
0)
# NOTE(review): this tensor is created with the name 'dst_coord_loc', the same name string
# used for self.dst_coord_loc in coord_trans_matmul - confirm that the duplicate is intended.
self.n_mul_k_loc = tik_instance.Tensor(self.loc_out_type, (64, 16, Constant.N0), name='dst_coord_loc',
scope=tik.scope_cbuf_out)
tik_instance.matmul(self.n_mul_k_loc, self.n_l1, self.mat_k_l1, 1024, 16, 16)
# Bring the matmul result back to UB (reusing dst_coord_n1mn0_ub) and convert n1mn0 -> mn.
self.nk_n1mn0_ub = self.dst_coord_n1mn0_ub.reshape((64, 16, Constant.N0))
self.tik_instance.data_move(self.nk_n1mn0_ub, self.n_mul_k_loc, 0, 1, ceil_value(1024 * Constant.N0, 256), 0, 0)
self.nk_hw4_ub = self.dst_coord_ub.reshape((1024, 16))
n1mn0_to_mn(self, self.nk_hw4_ub, self.nk_n1mn0_ub, self.loc_out_type, 1, 1024, 16, self.src_ub)
# Rearrange the (1024, 16) result into a (16, TILEH, TILEW) view with v4dtrans, then convert it
# to float16 in src_coord_ub.
# Presumably the first four planes are the bilinear weights of the four neighbours - TODO confirm.
self.nk_4hw_ub = self.dst_coord_n1mn0_ub.reshape((16, Constant.TILEH, Constant.TILEW))
tik_instance.v4dtrans(False, self.nk_4hw_ub, self.nk_hw4_ub, Constant.TILEM, 16)
self.nk_4hw_fp16_ub = self.src_coord_ub.reshape((16, Constant.TILEH, Constant.TILEW))
vec_conv_dynamic(self, 'none', 16 * Constant.TILEM, self.nk_4hw_fp16_ub, self.nk_4hw_ub)
self.move_input_image(n)
def move_input_image(self, n):
# Work out which part of input image n the current tile samples, gather the
# pixels (via gather_from_image), blend them with nk_4hw_fp16_ub, apply the
# border value and write the uint8 result to output_image_gm.
# n is the batch index.
tik_instance = self.tik_instance
# Element indices of the four tile corners inside x_int32_ub / y_int32_ub.
self.lft_top_idx = 0
self.lft_bot_idx = (self.cur_h - 1) * self.cur_wup_32
self.rgt_top_idx = self.cur_wup_32 - 1
self.rgt_bot_idx = self.lft_bot_idx + self.rgt_top_idx
# load_input_or_not starts at 1 and is cleared below when the sampled box misses the image.
self.load_input_or_not.set_as(1)
self.x_min.set_as(0)
self.x_max.set_as(0)
self.y_min.set_as(0)
self.y_max.set_as(0)
# Integer source coordinates at the four corners.
self.x_int32_lft_top_idx.set_as(self.x_int32_ub[self.lft_top_idx])
self.x_int32_lft_bot_idx.set_as(self.x_int32_ub[self.lft_bot_idx])
self.x_int32_rgt_top_idx.set_as(self.x_int32_ub[self.rgt_top_idx])
self.x_int32_rgt_bot_idx.set_as(self.x_int32_ub[self.rgt_bot_idx])
self.y_int32_lft_top_idx.set_as(self.y_int32_ub[self.lft_top_idx])
self.y_int32_lft_bot_idx.set_as(self.y_int32_ub[self.lft_bot_idx])
self.y_int32_rgt_top_idx.set_as(self.y_int32_ub[self.rgt_top_idx])
self.y_int32_rgt_bot_idx.set_as(self.y_int32_ub[self.rgt_bot_idx])
self.src0.set_as(0)
self.src1.set_as(0)
tik_instance.scalar_conv('none', self.input_width_float32, self.input_width)
tik_instance.scalar_conv('none', self.input_height_float32, self.input_height)
# x_min = max(0, min of the four corner x values); no load if it is beyond the last column.
tik_instance.scalar_conv('none', self.src0, self.x_int32_lft_top_idx)
tik_instance.scalar_conv('none', self.src1, self.x_int32_lft_bot_idx)
tik_instance.scalar_min(self.x_min, self.src0, self.src1)
tik_instance.scalar_conv('none', self.src1, self.x_int32_rgt_top_idx)
tik_instance.scalar_min(self.x_min, self.x_min, self.src1)
tik_instance.scalar_conv('none', self.src1, self.x_int32_rgt_bot_idx)
tik_instance.scalar_min(self.x_min, self.x_min, self.src1)
tik_instance.scalar_max(self.x_min, 0, self.x_min)
with tik_instance.if_scope(self.x_min > (self.input_width_float32 - 1)):
self.load_input_or_not.set_as(0)
# x_max = min(input_width - 1, max of the four corner x values + 1); no load if it is negative.
tik_instance.scalar_conv('none', self.src0, self.x_int32_lft_top_idx)
self.src0.set_as(self.src0 + 1)
tik_instance.scalar_conv('none', self.src1, self.x_int32_lft_bot_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.x_max, self.src0, self.src1)
tik_instance.scalar_conv('none', self.src1, self.x_int32_rgt_top_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.x_max, self.x_max, self.src1)
tik_instance.scalar_conv('none', self.src1, self.x_int32_rgt_bot_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.x_max, self.x_max, self.src1)
self.input_width_sub_one_float32.set_as(self.input_width_float32 - 1)
tik_instance.scalar_min(self.x_max, self.input_width_sub_one_float32, self.x_max)
with tik_instance.if_scope(self.x_max < 0):
self.load_input_or_not.set_as(0)
# y_min = max(0, min of the four corner y values); no load if it is beyond the last row.
tik_instance.scalar_conv('none', self.src0, self.y_int32_lft_top_idx)
tik_instance.scalar_conv('none', self.src1, self.y_int32_lft_bot_idx)
tik_instance.scalar_min(self.y_min, self.src0, self.src1)
tik_instance.scalar_conv('none', self.src1, self.y_int32_rgt_top_idx)
tik_instance.scalar_min(self.y_min, self.y_min, self.src1)
tik_instance.scalar_conv('none', self.src1, self.y_int32_rgt_bot_idx)
tik_instance.scalar_min(self.y_min, self.y_min, self.src1)
tik_instance.scalar_max(self.y_min, 0, self.y_min)
with tik_instance.if_scope(self.y_min > (self.input_height_float32 - 1)):
self.load_input_or_not.set_as(0)
# y_max = min(input_height - 1, max of the four corner y values + 1); no load if it is negative.
tik_instance.scalar_conv('none', self.src0, self.y_int32_lft_top_idx)
self.src0.set_as(self.src0 + 1)
tik_instance.scalar_conv('none', self.src1, self.y_int32_lft_bot_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.y_max, self.src0, self.src1)
tik_instance.scalar_conv('none', self.src1, self.y_int32_rgt_top_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.y_max, self.y_max, self.src1)
tik_instance.scalar_conv('none', self.src1, self.y_int32_rgt_bot_idx)
self.src1.set_as(self.src1 + 1)
tik_instance.scalar_max(self.y_max, self.y_max, self.src1)
input_height_sub_one_float32 = tik_instance.Scalar(self.dtype, name='input_height_sub_one_float32')
input_height_sub_one_float32.set_as(self.input_height_float32 - 1)
tik_instance.scalar_min(self.y_max, input_height_sub_one_float32, self.y_max)
with tik_instance.if_scope(self.y_max < 0):
self.load_input_or_not.set_as(0)
# gather_from_image returns the gathered pixel buffer and a flag buffer used by vec_sel below.
# Presumably the pixels are laid out as 4 neighbours x channel planes of TILEM elements
# (that is how they are indexed below) - TODO confirm against gather_from_image.
pixel_tmp_ub, flag_int_ub = self.gather_from_image(n)
offset1 = tik_instance.Scalar(self.scalar_dtype)
offset2 = tik_instance.Scalar(self.scalar_dtype)
# Multiply plane (i, c) of the gathered pixels by plane i of nk_4hw_fp16_ub.
with tik_instance.for_range(0, 4) as i:
offset2.set_as(i * Constant.TILEM)
with tik_instance.for_range(0, self.channel) as c:
offset1.set_as(((i * self.channel) + c) * Constant.TILEM)
tik_instance.vmul(128, pixel_tmp_ub[offset1], pixel_tmp_ub[offset1], self.nk_4hw_fp16_ub[offset2],
Constant.TILEM // 128, 1, 1, 1, 8, 8, 8)
# Accumulate groups 1..3 (channel * TILEM elements each) into group 0.
with tik_instance.for_range(1, 4) as i:
offset1.set_as(i * self.channel * Constant.TILEM)
tik_instance.vadd(128, pixel_tmp_ub, pixel_tmp_ub, pixel_tmp_ub[offset1],
self.channel * Constant.TILEM // 128, 1, 1, 1, 8, 8, 8)
# Pre-fill each channel plane of the result buffer (a view of src_coord_ub) with border_value[c].
pixel_val_fp16_ub = self.src_coord_ub.reinterpret_cast_to(Constant.FLOAT16)
scalar_temp = tik_instance.Scalar(Constant.FLOAT16, name='scalar_temp')
with tik_instance.for_range(0, self.channel) as c:
scalar_temp.set_as(self.border_value[c])
tik_instance.vec_dup(128, pixel_val_fp16_ub[c * Constant.TILEM], scalar_temp,
ceil_value(Constant.TILEM, 128), 8)
# Per channel, vec_sel chooses between the blended pixels and the border fill using flag_int_ub.
# NOTE(review): which operand a set flag bit selects depends on vec_sel mode 2 - confirm in the TIK docs.
with tik_instance.for_range(0, self.channel) as c:
offset1.set_as(c * Constant.TILEM)
tik_instance.vec_sel(128, 2, pixel_val_fp16_ub[offset1], flag_int_ub, pixel_tmp_ub[offset1],
pixel_val_fp16_ub[offset1], Constant.TILEM // 128, 8, 8, 8)
# Convert float16 -> uint8 with 'floor'; with more than one channel the planes are first
# rearranged by v4dtrans into src_ub (viewed as float16).
output_image_fp16_ub = self.src_ub.reinterpret_cast_to(Constant.FLOAT16)
output_image_u8_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(Constant.UINT8)
with tik_instance.if_scope(self.channel > 1):
tik_instance.v4dtrans(True, output_image_fp16_ub, pixel_val_fp16_ub, Constant.TILEM, self.channel)
vec_conv_dynamic(self, 'floor', (Constant.TILEM * self.channel), output_image_u8_ub, output_image_fp16_ub)
with tik_instance.else_scope():
vec_conv_dynamic(self, 'floor', (Constant.TILEM * self.channel), output_image_u8_ub, pixel_val_fp16_ub)
# Element offset of the tile's first pixel in output_image_gm for image n.
gm_offset_hw = tik_instance.Scalar(self.scalar_dtype)
gm_offset_hw.set_as((n * self.output_height * self.output_width + self.coord_offset) * self.channel)
# When output_width is a multiple of 32, all cur_h rows are written with one strided move;
# otherwise each row is written with its own move.
with tik_instance.if_scope(self.output_width % 32 == 0):
tik_instance.data_move(self.output_image_gm[gm_offset_hw], output_image_u8_ub, 0, self.cur_h,
ceil_block(Constant.TILEW * self.channel, self.image_dtype), 0,
ceil_block((self.output_width - Constant.TILEW) * self.channel, self.image_dtype))
with tik_instance.else_scope():
with tik_instance.for_range(0, self.cur_h) as h:
tik_instance.data_move(self.output_image_gm[gm_offset_hw + h * self.output_width * self.channel],
output_image_u8_ub[h * Constant.TILEW * self.channel], 0, 1,
ceil_block(Constant.TILEW * self.channel, self.image_dtype), 0, 0)
def gather_from_image(self, n):
    """Collect the four source samples of every tile pixel plus the in-range flags.

    Two strategies are emitted and selected at run time:

    * ``load_input_or_not == 1`` and the clamped source window (``htmp`` rows of
      ``wtmp_32`` block-aligned pixels) fits in ``Constant.BUFFERLIMIT`` bytes:
      the window is copied into UB, converted to float16 and sampled with ``vgather``.
    * ``load_input_or_not == 1`` but the window does not fit: every sample is read
      from ``input_image_gm`` element by element.

    When ``load_input_or_not`` is 0 nothing is gathered and the flags stay cleared.

    Layout shared by ``gather_index`` and the returned pixels: plane ``i * channel + c``
    (``i`` in 0..3) starts at ``(i * channel + c) * Constant.TILEM``. Relative to plane
    group 0, groups 1..3 address the element one pixel further, one row further, and
    one row plus one pixel further.

    Args:
        n: index of the image inside the batch; used to offset into ``input_image_gm``.

    Returns:
        Tuple ``(pixel_tmp_ub, flag_int_ub)``: the float16 view of ``dst_coord_ub`` that
        receives the samples, and the uint16 view of ``self.x`` that receives the
        comparison flags.
    """
    tik_instance = self.tik_instance
    # The flags reuse the small `self.x` buffer; clear them so that "nothing loaded"
    # leaves every pixel marked as out of range.
    flag_int_ub = self.x.reinterpret_cast_to(Constant.UINT16)
    tik_instance.vec_dup(64, flag_int_ub, 0, 1, 8)
    gather_index = self.dst_coord_n1mn0_ub.reinterpret_cast_to(self.scalar_dtype)
    pixel_tmp_ub = self.dst_coord_ub.reinterpret_cast_to(Constant.FLOAT16)
    with tik_instance.if_scope(self.load_input_or_not == 1):
        x_min_int32 = tik_instance.Scalar(self.scalar_dtype)
        x_max_int32 = tik_instance.Scalar(self.scalar_dtype)
        y_min_int32 = tik_instance.Scalar(self.scalar_dtype)
        y_max_int32 = tik_instance.Scalar(self.scalar_dtype)
        tik_instance.scalar_conv('round', x_min_int32, self.x_min)
        tik_instance.scalar_conv('round', x_max_int32, self.x_max)
        tik_instance.scalar_conv('round', y_min_int32, self.y_min)
        tik_instance.scalar_conv('round', y_max_int32, self.y_max)
        # Size of the source window, with the width rounded up to whole 32-byte blocks.
        self.htmp.set_as(y_max_int32 - y_min_int32 + 1)
        self.wtmp.set_as(x_max_int32 - x_min_int32 + 1)
        self.wtmp_32.set_as(
            ceil_block(self.wtmp, self.image_dtype) * Constant.TYPE_NUM_EACH_BLOCK.get(self.image_dtype))
        self.input_image_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(self.image_dtype)
        with tik_instance.if_scope(
                self.htmp * self.wtmp_32 * self.channel * Constant.TYPE_LEN_DICT.get(self.image_dtype)
                <= Constant.BUFFERLIMIT):
            # ---- Fast path: the window fits in UB. ----
            # Make the integer coordinates relative to the window origin.
            tik_instance.vec_adds(64, self.x_int32_ub, self.x_int32_ub, -1 * x_min_int32, Constant.TILEM // 64, 8,
                                  8)
            tik_instance.vec_adds(64, self.y_int32_ub, self.y_int32_ub, -1 * y_min_int32, Constant.TILEM // 64, 8,
                                  8)
            gm_in_offset1 = tik_instance.Scalar(self.scalar_dtype)
            gm_in_offset1.set_as(
                n * self.input_height * self.input_width + y_min_int32 * self.input_width + x_min_int32)
            gm_in_offset = tik_instance.Scalar(self.scalar_dtype)
            gm_in_offset.set_as(gm_in_offset1 * self.channel)
            with tik_instance.if_scope(
                    tik.all(
                        (self.input_width * self.channel) % Constant.TYPE_NUM_EACH_BLOCK.get(self.image_dtype) == 0,
                        self.input_width >= self.wtmp_32)):
                # Rows are block aligned: one strided move copies the whole window.
                tik_instance.data_move(self.input_image_ub, self.input_image_gm[gm_in_offset], 0, self.htmp,
                                       ceil_block(self.wtmp_32 * self.channel, self.image_dtype),
                                       ceil_block((self.input_width - self.wtmp_32) * self.channel,
                                                  self.image_dtype), 0)
            with tik_instance.else_scope():
                # Otherwise copy the window one row at a time.
                with tik_instance.for_range(0, self.htmp) as h:
                    tik_instance.data_move(self.input_image_ub[h * self.wtmp_32 * self.channel],
                                           self.input_image_gm[
                                               gm_in_offset + h * self.input_width * self.channel], 0, 1,
                                           ceil_block(self.wtmp_32 * self.channel, self.image_dtype), 0, 0)
            self.input_image_fp16_ub = self.src_ub.reinterpret_cast_to(Constant.FLOAT16)
            vec_conv_dynamic(self, 'none', (self.htmp * self.wtmp_32 * self.channel), self.input_image_fp16_ub,
                             self.input_image_ub)
            vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.x_fp32_ub, self.x_int32_ub)
            vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.y_fp32_ub, self.y_int32_ub)
            # Flags: 0 <= y <= htmp - 1 and 0 <= x <= wtmp - 1, AND-ed together.
            cmp_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(Constant.FLOAT32)
            tik_instance.vec_dup(64, cmp_ub, 0, ceil_value(Constant.TILEM, 64), 8)
            tik_instance.vec_dup(64, cmp_ub[1024], self.htmp - 1, ceil_value(Constant.TILEM, 64), 8)
            tik_instance.vec_dup(64, cmp_ub[2 * 1024], self.wtmp - 1, ceil_value(Constant.TILEM, 64), 8)
            repeat_times1 = ceil_value(Constant.TILEM, Constant.TYPE_MASK_DICT.get(Constant.INT32))
            tik_instance.vec_cmpv_ge(flag_int_ub, self.y_fp32_ub, cmp_ub, repeat_times1, 8, 8)
            tik_instance.vec_cmpv_le(flag_int_ub[64], self.y_fp32_ub, cmp_ub[1024], repeat_times1, 8, 8)
            repeat_times2 = 1
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            tik_instance.vec_cmpv_ge(flag_int_ub[64], self.x_fp32_ub, cmp_ub, repeat_times1, 8, 8)
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            tik_instance.vec_cmpv_le(flag_int_ub[64], self.x_fp32_ub, cmp_ub[2 * 1024], repeat_times1, 8, 8)
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            # Base element index inside the window: y * row_stride + x * channel.
            tmp_ub = self.dst_coord_ub.reinterpret_cast_to(self.scalar_dtype)
            tik_instance.vmuls(64, tmp_ub, self.y_int32_ub, self.wtmp_32 * self.channel, Constant.TILEM // 64, 1, 1,
                               8, 8)
            tik_instance.vmuls(64, tmp_ub[1024], self.x_int32_ub, self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadd(64, gather_index, tmp_ub, tmp_ub[1024], Constant.TILEM // 64, 1, 1, 1, 8, 8, 8)
            # Plane groups 1..3: +1 pixel, +1 row, +1 row +1 pixel.
            tik_instance.vadds(64, gather_index[self.channel * Constant.TILEM], gather_index, self.channel,
                               Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadds(64, gather_index[2 * self.channel * Constant.TILEM], gather_index,
                               self.wtmp_32 * self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadds(64, gather_index[3 * self.channel * Constant.TILEM],
                               gather_index[2 * self.channel * Constant.TILEM], self.channel, Constant.TILEM // 64,
                               1, 1, 8, 8)
            # Remaining channels are the channel-0 index plus the channel number.
            with tik_instance.for_range(1, self.channel) as c:
                with tik_instance.for_range(0, 4) as i:
                    tik_instance.vadds(64, gather_index[((i * self.channel) + c) * Constant.TILEM],
                                       gather_index[i * self.channel * Constant.TILEM], c, Constant.TILEM // 64, 1,
                                       1, 8, 8)
            # Element indices are scaled by 2 before the gather
            # (presumably converting float16 element indices to byte offsets - TODO confirm).
            vmuls_dynamic(self, 4 * self.channel * Constant.TILEM, gather_index, gather_index, 2)
            tik_instance.vgather(128, pixel_tmp_ub, self.input_image_fp16_ub, gather_index,
                                 4 * self.channel * Constant.TILEM // 128, 0, 0, 3)
        with tik_instance.else_scope():
            # ---- Slow path: the window is too large, sample global memory directly. ----
            # Flags are computed against the full image extent here.
            cmp_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(Constant.FLOAT32)
            tik_instance.vec_dup(64, cmp_ub, 0, ceil_value(Constant.TILEM, 64), 8)
            tik_instance.vec_dup(64, cmp_ub[1024], self.input_height_float32 - 1, ceil_value(Constant.TILEM, 64), 8)
            tik_instance.vec_dup(64, cmp_ub[2 * 1024], self.input_width_float32 - 1, ceil_value(Constant.TILEM, 64),
                                 8)
            repeat_times1 = ceil_value(Constant.TILEM, Constant.TYPE_MASK_DICT.get(self.dtype))
            tik_instance.vec_cmpv_ge(flag_int_ub, self.y_fp32_ub, cmp_ub, repeat_times1, 8, 8)
            tik_instance.vec_cmpv_le(flag_int_ub[64], self.y_fp32_ub, cmp_ub[1024], repeat_times1, 8, 8)
            repeat_times2 = 1
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            tik_instance.vec_cmpv_ge(flag_int_ub[64], self.x_fp32_ub, cmp_ub, repeat_times1, 8, 8)
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            tik_instance.vec_cmpv_le(flag_int_ub[64], self.x_fp32_ub, cmp_ub[2 * 1024], repeat_times1, 8, 8)
            tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
            # Base element index inside one full image: y * input_width * channel + x * channel.
            tmp_ub = self.dst_coord_ub.reinterpret_cast_to(self.scalar_dtype)
            tik_instance.vmuls(64, tmp_ub, self.y_int32_ub, self.input_width * self.channel, Constant.TILEM // 64,
                               1, 1, 8, 8)
            tik_instance.vmuls(64, tmp_ub[1024], self.x_int32_ub, self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadd(64, gather_index, tmp_ub, tmp_ub[1024], Constant.TILEM // 64, 1, 1, 1, 8, 8, 8)
            tik_instance.vadds(64, gather_index[self.channel * Constant.TILEM], gather_index, self.channel,
                               Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadds(64, gather_index[2 * self.channel * Constant.TILEM], gather_index,
                               self.input_width * self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
            tik_instance.vadds(64, gather_index[3 * self.channel * Constant.TILEM],
                               gather_index[2 * self.channel * Constant.TILEM], self.channel, Constant.TILEM // 64,
                               1, 1, 8, 8)
            with tik_instance.for_range(1, self.channel) as c:
                with tik_instance.for_range(0, 4) as i:
                    tik_instance.vadds(64, gather_index[((i * self.channel) + c) * Constant.TILEM],
                                       gather_index[i * self.channel * Constant.TILEM], c, Constant.TILEM // 64, 1,
                                       1, 8, 8)
            # Shift every index to image `n` of the batch.
            tik_instance.vec_adds(64, gather_index, gather_index,
                                  n * self.input_height * self.input_width * self.channel,
                                  4 * self.channel * Constant.TILEM // 64, 8, 8)
            pixel_tmp_uint8_ub = self.src_ub.reinterpret_cast_to(self.image_dtype)
            offset = tik_instance.Scalar(self.scalar_dtype)
            index = tik_instance.Scalar(self.scalar_dtype)
            with tik_instance.for_range(0, self.channel) as c:
                with tik_instance.for_range(0, self.cur_h) as h:
                    with tik_instance.for_range(0, self.cur_wup_32) as w:
                        for plane_group in range(4):
                            # Planes are Constant.TILEM apart, matching how gather_index was
                            # filled above and how the conversion below reads the pixels.
                            # The previous `cur_h * cur_wup_32` stride only matched for
                            # full-height tiles.
                            offset.set_as((plane_group * self.channel + c) * Constant.TILEM
                                          + h * self.cur_wup_32 + w)
                            index.set_as(gather_index[offset])
                            pixel_tmp_uint8_ub[offset].set_as(self.input_image_gm[index])
            vec_conv_dynamic(self, 'none', 4 * self.channel * Constant.TILEM, pixel_tmp_ub, pixel_tmp_uint8_ub)
    return pixel_tmp_ub, flag_int_ub
class WarpPerspectiveFloat():
def __init__(self, image_dtype):
    """Create the TIK build context and read the tiling arguments.

    Args:
        image_dtype: ``Constant.FLOAT32`` selects the float32 kernel; any other
            value selects the float16 kernel.
    """
    # The target SoC is fixed for this kernel.
    tbe_platform.set_current_compile_soc_info("Ascend310P3", core_type="AiCore")
    d_profile = tik.Dprofile()
    self.tik_instance = tik.Tik(d_profile, disable_debug=False)
    self.core_num = d_profile.get_aicore_num()
    self.ub_size = d_profile.get_unified_buffer_size()

    # Kernel name and image element type follow the requested image dtype.
    if image_dtype == Constant.FLOAT32:
        self.kernel_name = "warp_perspective_float32"
        self.image_dtype = Constant.FLOAT32
    else:
        self.kernel_name = "warp_perspective_float16"
        self.image_dtype = Constant.FLOAT16

    # Working types: float32 maths, int32 scalars, float16 into L1, float32 out of the matmul.
    self.dtype = Constant.FLOAT32
    self.scalar_dtype = Constant.INT32
    self.tiling_dtype = Constant.FLOAT32
    self.l1_in_type = Constant.FLOAT16
    self.loc_out_type = Constant.FLOAT32

    self.dtype_bytes_size = Constant.TYPE_LEN_DICT.get(self.dtype)
    self.data_each_block = Constant.BLOCK_BYTE_SIZE // self.dtype_bytes_size
    self.image_mask = Constant.TYPE_MASK_DICT.get(self.image_dtype)

    self.tiling_data = self.tik_instance.InputScalar(dtype=Constant.INT32, name="tiling_data")
    self.init_scalar()
    self.get_tiling_args()

    # Shapes are built from the scalars that get_tiling_args() has just filled in.
    batch, channels = self.number, self.channel
    self.input_image_shape = (batch, self.input_height, self.input_width, channels)
    self.output_image_shape = (batch, self.output_height, self.output_width, channels)
    self.bilinear_shape = (1, Constant.TILEM)
    self.bilinear_nthread = Constant.TILEM
def init_scalar(self):
    """Allocate every TIK scalar used by the kernel, in a fixed order."""
    new_scalar = self.tik_instance.Scalar
    int_dtype = self.scalar_dtype
    float_dtype = self.dtype

    # Values decoded from the tiling buffer.
    self.output_height = new_scalar(dtype=int_dtype)
    self.output_width = new_scalar(dtype=int_dtype)
    self.flags = new_scalar(dtype=int_dtype)
    self.border_mode = new_scalar(dtype=int_dtype)
    self.number = new_scalar(dtype=int_dtype)
    self.input_height = new_scalar(dtype=int_dtype)
    self.input_width = new_scalar(dtype=int_dtype)
    self.channel = new_scalar(dtype=int_dtype)

    # Tile loop bookkeeping.
    self.loop_w = new_scalar(int_dtype)
    self.loop_h = new_scalar(int_dtype)
    self.loop_total = new_scalar(int_dtype)
    self.loop_w_idx = new_scalar(int_dtype)
    self.loop_h_idx = new_scalar(int_dtype)
    self.cur_w = new_scalar(int_dtype)
    self.cur_wup_32 = new_scalar(int_dtype)
    self.cur_h = new_scalar(int_dtype)
    self.w_start = new_scalar(int_dtype)
    self.h_start = new_scalar(int_dtype)
    self.coord_offset = new_scalar(int_dtype)
    self.temp = new_scalar(Constant.FLOAT16)

    # Source-window state shared by move_input_image and gather_from_image.
    self.load_input_or_not = new_scalar(int_dtype)
    self.x_min = new_scalar(float_dtype)
    self.x_max = new_scalar(float_dtype)
    self.y_min = new_scalar(float_dtype)
    self.y_max = new_scalar(float_dtype)

    # Integer source coordinates sampled at the four tile corners.
    self.x_int32_lft_top_idx = new_scalar(int_dtype)
    self.x_int32_lft_bot_idx = new_scalar(int_dtype)
    self.x_int32_rgt_top_idx = new_scalar(int_dtype)
    self.x_int32_rgt_bot_idx = new_scalar(int_dtype)
    self.y_int32_lft_top_idx = new_scalar(int_dtype)
    self.y_int32_lft_bot_idx = new_scalar(int_dtype)
    self.y_int32_rgt_top_idx = new_scalar(int_dtype)
    self.y_int32_rgt_bot_idx = new_scalar(int_dtype)

    # Float scratch values and float copies of the input extent.
    self.src0 = new_scalar(float_dtype)
    self.src1 = new_scalar(float_dtype)
    self.input_width_float32 = new_scalar(float_dtype)
    self.input_height_float32 = new_scalar(float_dtype)

    # Integer window bounds and window size.
    self.x_min_int32 = new_scalar(int_dtype)
    self.x_max_int32 = new_scalar(int_dtype)
    self.y_min_int32 = new_scalar(int_dtype)
    self.y_max_int32 = new_scalar(int_dtype)
    self.h_tmp = new_scalar(int_dtype)
    self.w_tmp = new_scalar(int_dtype)
    self.w_tmp_32 = new_scalar(int_dtype)
def get_tiling_args(self):
    """Load the tiling buffer from global memory and decode it into scalars.

    Slots read here: 9/10 output height/width, 11 flags, 12 border mode,
    13..16 border values, 17 batch size, 18/19 input height/width, 20 channel count.
    Slots 0..8 are consumed later from ``tiling_float16_ub``.
    """
    tik_inst = self.tik_instance
    arg_num = Constant.TILING_ARG_NUM

    self.tiling_gm = tik_inst.Tensor(self.tiling_dtype, (arg_num,), name="tiling_gm", scope=tik.scope_gm)
    self.tiling_float32_ub = tik_inst.Tensor(self.dtype, (arg_num,), name="tiling_float32_ub",
                                             scope=tik.scope_ubuf)
    self.tiling_float16_ub = tik_inst.Tensor(Constant.FLOAT16, (arg_num,), name="tiling_float16_ub",
                                             scope=tik.scope_ubuf)
    self.tiling_int32_ub = tik_inst.Tensor(self.scalar_dtype, (arg_num,), name="tiling_int32_ub",
                                           scope=tik.scope_ubuf)

    # Keep three views of the arguments: raw float32, float16, and float16 rounded to int32.
    tik_inst.data_move(self.tiling_float32_ub, self.tiling_gm, 0, 1, ceil_block(arg_num, self.tiling_dtype), 0, 0)
    vec_conv_dynamic(self, "none", arg_num, self.tiling_float16_ub, self.tiling_float32_ub)
    vec_conv_dynamic(self, "round", arg_num, self.tiling_int32_ub, self.tiling_float16_ub)

    def read_rounded(target, slot):
        # Image dimensions are rounded straight from the float32 copy.
        as_float = tik_inst.Scalar(dtype=self.tiling_dtype, init_value=self.tiling_float32_ub[slot])
        tik_inst.scalar_conv("round", target, as_float)

    read_rounded(self.output_height, 9)
    read_rounded(self.output_width, 10)
    self.flags.set_as(self.tiling_int32_ub[11])
    self.border_mode.set_as(self.tiling_int32_ub[12])

    # Border values are taken from the copy whose dtype matches the image.
    self.border_value = tik_inst.Tensor(self.image_dtype, (4,), name="border_value", scope=tik.scope_ubuf)
    if self.image_dtype == Constant.FLOAT32:
        border_source = self.tiling_float32_ub
    else:
        border_source = self.tiling_float16_ub
    for k in range(4):
        self.border_value[k].set_as(border_source[13 + k])

    self.number.set_as(self.tiling_int32_ub[17])
    read_rounded(self.input_height, 18)
    read_rounded(self.input_width, 19)
    self.channel.set_as(self.tiling_int32_ub[20])
def warp_perspective_compute(self):
    """Declare the image tensors in global memory, emit the kernel and build it.

    Returns:
        The TIK instance the kernel was built with.
    """
    tik_inst = self.tik_instance
    self.input_image_gm = tik_inst.Tensor(self.image_dtype, self.input_image_shape,
                                          name="input_image_gm", scope=tik.scope_gm)
    self.output_image_gm = tik_inst.Tensor(self.image_dtype, self.output_image_shape,
                                           name="output_image_gm", scope=tik.scope_gm)
    kernel_inputs = [self.input_image_gm, self.tiling_gm]
    kernel_outputs = [self.output_image_gm]

    # Emit the whole kernel body before handing it to the builder.
    self.warp_perspective_compute_tiling()
    tik_inst.BuildCCE(kernel_name=self.kernel_name,
                      inputs=kernel_inputs,
                      outputs=kernel_outputs,
                      flowtable=[self.tiling_data, ])
    return tik_inst
def warp_perspective_compute_tiling(self):
    """Stage the two constant 16x16 matrices in L1 and distribute the tiles over the cores."""
    tik_inst = self.tik_instance

    # Number of tiles per direction (rounded up) and in total.
    self.loop_w.set_as((self.output_width + Constant.TILEW - 1) / Constant.TILEW)
    self.loop_h.set_as((self.output_height + Constant.TILEH - 1) / Constant.TILEH)
    self.loop_total.set_as(self.loop_w * self.loop_h)

    dup_repeat = ceil_value(16 * 16, 128)
    matrix_blocks = ceil_block(256, Constant.FLOAT16)

    # First matrix: fixed +/-1 coefficients at these flat positions of the 16x16 buffer
    # (presumably the bilinear weight combination matrix - TODO confirm).
    self.x = tik_inst.Tensor(Constant.FLOAT16, (16, 16), name="x", scope=tik.scope_ubuf)
    tik_inst.vec_dup(128, self.x, 0, dup_repeat, 8)
    coefficients = ((0, 1), (1, -1), (2, -1), (3, 1), (17, 1), (19, -1), (34, 1), (35, -1), (51, 1))
    for position, coefficient in coefficients:
        self.x[position].set_as(coefficient)
    self.mat_k_l1 = tik_inst.Tensor(self.l1_in_type, (1, 16, 16), name='mat_k_l1', scope=tik.scope_cbuf)
    tik_inst.data_move(self.mat_k_l1, self.x, 0, 1, matrix_blocks, 0, 0)

    # Second matrix: tiling slots 0..8 placed as the top-left 3x3 of a zeroed 16x16 buffer.
    tik_inst.vec_dup(128, self.x, 0, dup_repeat, 8)
    for row in range(3):
        for col in range(3):
            self.x[16 * row + col] = self.tiling_float16_ub[3 * row + col]
    self.trans_matrix_l1 = tik_inst.Tensor(self.l1_in_type, (1, 16, 16), name="trans_matrix_l1",
                                           scope=tik.scope_cbuf)
    tik_inst.data_move(self.trans_matrix_l1, self.x, 0, 1, matrix_blocks, 0, 0)

    # Each core takes an equal contiguous share; the first `remainder` cores take one extra tile.
    total_tasks = self.loop_total
    tasks_per_core = total_tasks // self.core_num
    remainder = total_tasks % self.core_num
    with tik_inst.for_range(0, self.core_num, block_num=self.core_num) as i:
        with tik_inst.for_range(0, tasks_per_core, name='j') as j:
            self.compute_per_core(i * tasks_per_core + j)
        with tik_inst.if_scope(i < remainder):
            self.compute_per_core(self.core_num * tasks_per_core + i)
def compute_per_core(self, loopi):
    """Process one output tile for every image of the batch.

    Args:
        loopi: flat tile index; column is ``loopi % loop_w``, row is ``loopi / loop_w``.
    """
    tik_inst = self.tik_instance
    tile_w = Constant.TILEW
    tile_h = Constant.TILEH

    self.loop_w_idx.set_as(loopi % self.loop_w)
    self.loop_h_idx.set_as(loopi / self.loop_w)

    # The last tile of a row only covers what is left of the output width.
    with tik_inst.if_scope(self.loop_w_idx == self.loop_w - 1):
        self.cur_w.set_as(self.output_width - self.loop_w_idx * tile_w)
    with tik_inst.else_scope():
        self.cur_w.set_as(tile_w)
    # The processed width is always a full tile.
    self.cur_wup_32.set_as(tile_w)

    # The last tile of a column only covers what is left of the output height.
    with tik_inst.if_scope(self.loop_h_idx == self.loop_h - 1):
        self.cur_h.set_as(self.output_height - self.loop_h_idx * tile_h)
    with tik_inst.else_scope():
        self.cur_h.set_as(tile_h)

    # A narrow last tile is shifted left so a full-width tile ends on the right image edge.
    with tik_inst.if_scope(self.cur_w == self.cur_wup_32):
        self.w_start.set_as(self.loop_w_idx * tile_w)
    with tik_inst.else_scope():
        self.w_start.set_as(self.output_width - tile_w)
    self.h_start.set_as(self.loop_h_idx * tile_h)
    self.coord_offset.set_as(self.h_start * self.output_width + self.w_start)

    with tik_inst.for_range(0, self.number) as n:
        self.coord_trans()
        self.get_image_bilinear(n)
def coord_trans(self):
    """Fill ``src_coord_ub`` with the tile's output coordinates and transform them.

    Row 0 receives the x coordinates, row 1 the y coordinates, row 2 the constant 1
    and rows 3..15 zero; ``coord_trans_matmul`` then multiplies by the transform matrix.
    """
    tik_inst = self.tik_instance
    tile_m = Constant.TILEM
    width = self.cur_wup_32
    self.src_coord_ub = tik_inst.Tensor(self.l1_in_type, (16, tile_m), name="src_coord_ub",
                                        scope=tik.scope_ubuf)

    # Build one row of x coordinates in the scratch buffer `self.x`.
    with tik_inst.for_range(0, width) as w:
        self.temp.set_as(w + self.w_start)
        self.x[w].set_as(self.temp)

    # Replicate that row for every tile row and fill the matching y coordinate.
    with tik_inst.for_range(0, self.cur_h) as h:
        row_start = h * width
        tik_inst.data_move(self.src_coord_ub[row_start], self.x, 0, 1, ceil_block(width, self.l1_in_type), 0, 0)
        tik_inst.vec_dup(width, self.src_coord_ub[tile_m + row_start], h + self.h_start, 1, 8)

    tik_inst.vec_dup(128, self.src_coord_ub[2 * tile_m], 1, ceil_value(tile_m, 128), 8)
    vec_dup_dynamic(self, (16 - 3) * tile_m, self.src_coord_ub[3 * tile_m], 0)
    self.coord_trans_matmul()
def coord_trans_matmul(self):
    """Multiply the transform matrix with the tile coordinates and divide by the third row.

    Leaves the results in ``self.x_fp32_ub`` and ``self.y_fp32_ub``: rows 0 and 1 of the
    product, each divided element-wise by row 2.
    """
    tik_inst = self.tik_instance
    tile_m = Constant.TILEM
    out_dtype = self.loc_out_type

    # Re-layout the coordinates into L1 and run the matrix product.
    self.src_coord_l1 = tik_inst.Tensor(self.l1_in_type, Constant.K1NK0_SHAPE, name='src_coord_l1',
                                        scope=tik.scope_cbuf)
    kn_to_k1nk0(self, self.src_coord_ub, self.src_coord_l1, self.l1_in_type, Constant.K1, Constant.N, Constant.K0)
    self.dst_coord_loc = tik_inst.Tensor(out_dtype, Constant.N1MN0_SHAPE, name='dst_coord_loc',
                                         scope=tik.scope_cbuf_out)
    tik_inst.matmul(self.dst_coord_loc, self.trans_matrix_l1, self.src_coord_l1, Constant.M, Constant.K,
                    Constant.N)

    # Copy the product back to UB and restore a row-major layout.
    self.dst_coord_n1mn0_ub = tik_inst.Tensor(out_dtype, Constant.N1MN0_SHAPE, name='dst_coord_n1mn0_ub',
                                              scope=tik.scope_ubuf)
    tik_inst.data_move(self.dst_coord_n1mn0_ub, self.dst_coord_loc, 0, 1,
                       ceil_value(Constant.N1MN0_NTHREAD, 256), 0, 0)
    self.dst_coord_ub = tik_inst.Tensor(out_dtype, Constant.DST_COORD_SHAPE, name='dst_coord_ub',
                                        scope=tik.scope_ubuf)
    src_ub_len = Constant.BUFFERLIMIT * 2 // Constant.TYPE_LEN_DICT.get(out_dtype)
    self.src_ub = tik_inst.Tensor(out_dtype, (src_ub_len,), name="src_ub", scope=tik.scope_ubuf)
    n1mn0_to_mn(self, self.dst_coord_ub, self.dst_coord_n1mn0_ub, out_dtype, Constant.N1, Constant.M,
                Constant.N0, self.src_ub)

    # Divide rows 0 and 1 by row 2.
    self.x_fp32_ub = tik_inst.Tensor(self.dtype, self.bilinear_shape, name='x_fp32_ub', scope=tik.scope_ubuf)
    self.y_fp32_ub = tik_inst.Tensor(self.dtype, self.bilinear_shape, name='y_fp32_ub', scope=tik.scope_ubuf)
    divisor = self.dst_coord_ub[2 * tile_m]
    repeats = ceil_value(tile_m, 64)
    tik_inst.vdiv(64, self.x_fp32_ub, self.dst_coord_ub, divisor, repeats, 1, 1, 1, 8, 8, 8)
    tik_inst.vdiv(64, self.y_fp32_ub, self.dst_coord_ub[tile_m], divisor, repeats, 1, 1, 1, 8, 8, 8)
def get_image_bilinear(self, n):
    """Compute the per-pixel interpolation weights, then sample image ``n``.

    The floored coordinates go to ``x_int32_ub`` / ``y_int32_ub``. Three per-pixel terms
    derived from the float coordinates and their floors are stacked under a row of ones,
    multiplied with ``mat_k_l1`` and left in ``nk_4hw_fp_ub`` in the image dtype.

    Args:
        n: index of the image inside the batch, forwarded to ``move_input_image``.
    """
    tik_inst = self.tik_instance
    nthread = self.bilinear_nthread
    scratch = self.dst_coord_n1mn0_ub
    stacked = self.src_coord_ub

    self.x_int32_ub = tik_inst.Tensor(Constant.INT32, self.bilinear_shape, name='x_int32_ub',
                                      scope=tik.scope_ubuf)
    self.y_int32_ub = tik_inst.Tensor(Constant.INT32, self.bilinear_shape, name='y_int32_ub',
                                      scope=tik.scope_ubuf)
    vec_conv_dynamic(self, 'floor', nthread, self.x_int32_ub, self.x_fp32_ub)
    vec_conv_dynamic(self, 'floor', nthread, self.y_int32_ub, self.y_fp32_ub)

    # scratch rows 0/1: floored coordinates converted back to float.
    vec_conv_dynamic(self, 'none', nthread, scratch, self.x_int32_ub)
    vec_conv_dynamic(self, 'none', nthread, scratch[nthread], self.y_int32_ub)
    # scratch rows 2/3 combine each coordinate with its floor and row 4 combines rows 2 and 3
    # (modes 3 and 1 are presumably subtract and multiply - TODO confirm in the helper).
    vmul_vadd_vsub_dynamic(self, 3, nthread, scratch[2 * nthread], self.x_fp32_ub, scratch)
    vmul_vadd_vsub_dynamic(self, 3, nthread, scratch[3 * nthread], self.y_fp32_ub, scratch[nthread])
    vmul_vadd_vsub_dynamic(self, 1, nthread, scratch[4 * nthread], scratch[2 * nthread], scratch[3 * nthread])

    # Stack: row 0 = ones, rows 1..3 = scratch rows 2..4 in the L1 input dtype, rows 4..15 = zero.
    tik_inst.vec_dup(128, stacked, 1, ceil_value(nthread, 128), 8)
    for row in (1, 2, 3):
        vec_conv_dynamic(self, 'none', Constant.TILEM, stacked[row * nthread], scratch[(row + 1) * nthread])
    vec_dup_dynamic(self, 12 * nthread, stacked[4 * nthread], 0)

    # Re-layout and transpose the stack, then move it to L1 for the matrix product.
    self.k1mk0_workspace = scratch.reinterpret_cast_to(self.l1_in_type)
    mk_to_k1mk0(self, stacked, self.k1mk0_workspace, self.l1_in_type, 64, 16, Constant.K0)
    self.k1mk0_workspace_trans = stacked.reshape((64, 16, Constant.K0))
    tik_inst.vec_trans(self.k1mk0_workspace_trans, self.k1mk0_workspace, 64, 1, 1)
    self.n_l1 = tik_inst.Tensor(self.l1_in_type, (64, 16, Constant.K0), name='n_l1', scope=tik.scope_cbuf)
    tik_inst.data_move(self.n_l1, self.k1mk0_workspace_trans, 0, 1, ceil_block(1024 * 16, self.l1_in_type), 0,
                       0)
    # NOTE(review): this tensor reuses the name 'dst_coord_loc' from coord_trans_matmul.
    self.n_mul_k_loc = tik_inst.Tensor(self.loc_out_type, (64, 16, Constant.N0), name='dst_coord_loc',
                                       scope=tik.scope_cbuf_out)
    tik_inst.matmul(self.n_mul_k_loc, self.n_l1, self.mat_k_l1, 1024, 16, 16)

    # Bring the product back to UB and reorder it so each of the 16 columns is contiguous.
    self.nk_n1mn0_ub = scratch.reshape((64, 16, Constant.N0))
    self.tik_instance.data_move(self.nk_n1mn0_ub, self.n_mul_k_loc, 0, 1, ceil_value(1024 * Constant.N0, 256), 0, 0)
    self.nk_hw4_ub = self.dst_coord_ub.reshape((1024, 16))
    n1mn0_to_mn(self, self.nk_hw4_ub, self.nk_n1mn0_ub, self.loc_out_type, 1, 1024, 16, self.src_ub)
    self.nk_4hw_ub = scratch.reshape((16, Constant.TILEH, Constant.TILEW))
    tik_inst.v4dtrans(False, self.nk_4hw_ub, self.nk_hw4_ub, Constant.TILEM, 16)

    # Keep the first four planes in the image dtype: plain copy for float32, conversion otherwise.
    self.nk_4hw_fp_ub = stacked.reinterpret_cast_to(self.image_dtype)
    if self.image_dtype == Constant.FLOAT32:
        tik_inst.data_move(self.nk_4hw_fp_ub, self.nk_4hw_ub, 0, 1, ceil_block(4 * 1024, Constant.FLOAT32), 0,
                           0)
    else:
        vec_conv_dynamic(self, 'none', 4 * Constant.TILEM, self.nk_4hw_fp_ub, self.nk_4hw_ub)
    self.move_input_image(n)
def move_input_image(self, n):
    """Bound the source window of the tile, sample it, blend, and store the output tile.

    Steps emitted:
      1. Read the integer source coordinates at the four tile corners and derive
         ``x_min/x_max/y_min/y_max`` clamped to the input image; ``load_input_or_not``
         is cleared when a bound falls outside the image.
      2. ``gather_from_image`` returns the samples and the validity flags.
      3. Samples are multiplied by their weights, the four plane groups are summed,
         flagged-out pixels take the border value, and the result is written to
         ``output_image_gm``.

    Args:
        n: index of the image inside the batch.
    """
    tik_inst = self.tik_instance
    tile_m = Constant.TILEM
    tile_w = Constant.TILEW

    # Flat positions of the four tile corners inside the coordinate buffers.
    self.lft_top_idx = 0
    self.lft_bot_idx = (self.cur_h - 1) * self.cur_wup_32
    self.rgt_top_idx = self.cur_wup_32 - 1
    self.rgt_bot_idx = self.lft_bot_idx + self.rgt_top_idx

    self.load_input_or_not.set_as(1)
    for bound in (self.x_min, self.x_max, self.y_min, self.y_max):
        bound.set_as(0)

    corner_positions = (self.lft_top_idx, self.lft_bot_idx, self.rgt_top_idx, self.rgt_bot_idx)
    x_corners = (self.x_int32_lft_top_idx, self.x_int32_lft_bot_idx,
                 self.x_int32_rgt_top_idx, self.x_int32_rgt_bot_idx)
    y_corners = (self.y_int32_lft_top_idx, self.y_int32_lft_bot_idx,
                 self.y_int32_rgt_top_idx, self.y_int32_rgt_bot_idx)
    for coord_ub, corners in ((self.x_int32_ub, x_corners), (self.y_int32_ub, y_corners)):
        for corner, position in zip(corners, corner_positions):
            corner.set_as(coord_ub[position])

    self.src0.set_as(0)
    self.src1.set_as(0)
    tik_inst.scalar_conv('none', self.input_width_float32, self.input_width)
    tik_inst.scalar_conv('none', self.input_height_float32, self.input_height)

    def bound_axis(corners, low, high, extent_f32, limit_name):
        # Lower bound: min over the corners, clamped at 0; nothing to load past the far edge.
        tik_inst.scalar_conv('none', self.src0, corners[0])
        tik_inst.scalar_conv('none', self.src1, corners[1])
        tik_inst.scalar_min(low, self.src0, self.src1)
        for corner in corners[2:]:
            tik_inst.scalar_conv('none', self.src1, corner)
            tik_inst.scalar_min(low, low, self.src1)
        tik_inst.scalar_max(low, 0, low)
        with tik_inst.if_scope(low > (extent_f32 - 1)):
            self.load_input_or_not.set_as(0)

        # Upper bound: max over (corner + 1), clamped at extent - 1; nothing to load below 0.
        tik_inst.scalar_conv('none', self.src0, corners[0])
        self.src0.set_as(self.src0 + 1)
        tik_inst.scalar_conv('none', self.src1, corners[1])
        self.src1.set_as(self.src1 + 1)
        tik_inst.scalar_max(high, self.src0, self.src1)
        for corner in corners[2:]:
            tik_inst.scalar_conv('none', self.src1, corner)
            self.src1.set_as(self.src1 + 1)
            tik_inst.scalar_max(high, high, self.src1)
        last_valid = tik_inst.Scalar(self.dtype, name=limit_name)
        last_valid.set_as(extent_f32 - 1)
        tik_inst.scalar_min(high, last_valid, high)
        with tik_inst.if_scope(high < 0):
            self.load_input_or_not.set_as(0)

    bound_axis(x_corners, self.x_min, self.x_max, self.input_width_float32, 'input_width_sub_one_float32')
    bound_axis(y_corners, self.y_min, self.y_max, self.input_height_float32, 'input_height_sub_one_float32')

    pixel_tmp_ub, flag_int_ub = self.gather_from_image(n)

    mask = self.image_mask
    plane_repeats = tile_m // mask
    weight_offset = tik_inst.Scalar(self.scalar_dtype)
    pixel_offset = tik_inst.Scalar(self.scalar_dtype)
    # Swap creation order back to the original: pixel offset first, weight offset second.
    weight_offset, pixel_offset = pixel_offset, weight_offset

    # Scale every sample plane by the weight plane of its group.
    with tik_inst.for_range(0, 4) as i:
        weight_offset.set_as(i * tile_m)
        with tik_inst.for_range(0, self.channel) as c:
            pixel_offset.set_as(((i * self.channel) + c) * tile_m)
            tik_inst.vmul(mask, pixel_tmp_ub[pixel_offset], pixel_tmp_ub[pixel_offset],
                          self.nk_4hw_fp_ub[weight_offset], plane_repeats, 1, 1, 1, 8, 8, 8)

    # Accumulate groups 1..3 onto group 0.
    with tik_inst.for_range(1, 4) as i:
        pixel_offset.set_as(i * self.channel * tile_m)
        tik_inst.vadd(mask, pixel_tmp_ub, pixel_tmp_ub, pixel_tmp_ub[pixel_offset],
                      self.channel * tile_m // mask, 1, 1, 1, 8, 8, 8)

    # Select between the blended value and the channel's border value using the flags.
    scalar_temp = tik_inst.Scalar(self.image_dtype, name='scalar_temp')
    with tik_inst.for_range(0, self.channel) as c:
        pixel_offset.set_as(c * tile_m)
        scalar_temp.set_as(self.border_value[c])
        tik_inst.vec_sel(mask, 1, pixel_tmp_ub[pixel_offset], flag_int_ub, pixel_tmp_ub[pixel_offset],
                         scalar_temp, plane_repeats, 8, 8, 8)

    # Interleave the channel planes, then store the tile.
    output_image_fp_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(self.image_dtype)
    tik_inst.v4dtrans(True, output_image_fp_ub, pixel_tmp_ub, tile_m, self.channel)
    gm_offset_hw = tik_inst.Scalar(self.scalar_dtype)
    gm_offset_hw.set_as((n * self.output_height * self.output_width + self.coord_offset) * self.channel)
    row_blocks = ceil_block(tile_w * self.channel, self.image_dtype)
    with tik_inst.if_scope(self.output_width % 32 == 0):
        # One strided move writes all rows of the tile.
        tik_inst.data_move(self.output_image_gm[gm_offset_hw], output_image_fp_ub, 0, self.cur_h,
                           row_blocks, 0,
                           ceil_block((self.output_width - tile_w) * self.channel, self.image_dtype))
    with tik_inst.else_scope():
        # Otherwise write the tile row by row.
        with tik_inst.for_range(0, self.cur_h) as h:
            tik_inst.data_move(self.output_image_gm[gm_offset_hw + h * self.output_width * self.channel],
                               output_image_fp_ub[h * tile_w * self.channel], 0, 1,
                               row_blocks, 0, 0)
def gather_from_image(self, n):
# Gather, for every destination position of the current tile, four source
# samples per channel into UB and compute a range-check flag for the source
# coordinates.
#
# Args:
#     n: image index; only used to build element offsets of the form
#        n * input_height * input_width (* channel) into input_image_gm.
#
# Returns:
#     (pixel_tmp_ub, flag_int_ub)
#     pixel_tmp_ub: self.dst_coord_ub viewed as image_dtype, holding the
#         gathered samples in 4 * channel planes.
#     flag_int_ub: self.x viewed as uint16, holding the AND of the four
#         coordinate range comparisons.
#
# NOTE(review): the four sample planes use offsets +0, +channel, +row_stride
# and +row_stride + channel, which looks like the 2x2 bilinear neighbourhood
# - confirm against the caller.
tik_instance = self.tik_instance
# Existing UB tensors are re-viewed under other dtypes; no tensor is allocated here.
flag_int_ub = self.x.reinterpret_cast_to(Constant.UINT16)
# Clear the first 64 uint16 elements of the flag buffer.
tik_instance.vec_dup(64, flag_int_ub, 0, 1, 8)
gather_index = self.dst_coord_n1mn0_ub.reinterpret_cast_to(self.scalar_dtype)
pixel_tmp_ub = self.dst_coord_ub.reinterpret_cast_to(self.image_dtype)
# When load_input_or_not == 1: round the float bounds x_min/x_max/y_min/y_max
# to int32 and derive the source window size (bounds are inclusive, hence +1).
# NOTE(review): confirm exactly which of the following statements this guard covers.
with tik_instance.if_scope(self.load_input_or_not == 1):
tik_instance.scalar_conv('round', self.x_min_int32, self.x_min)
tik_instance.scalar_conv('round', self.x_max_int32, self.x_max)
tik_instance.scalar_conv('round', self.y_min_int32, self.y_min)
tik_instance.scalar_conv('round', self.y_max_int32, self.y_max)
self.h_tmp.set_as(self.y_max_int32 - self.y_min_int32 + 1)
self.w_tmp.set_as(self.x_max_int32 - self.x_min_int32 + 1)
# w_tmp_32 = w_tmp rounded up to a whole number of blocks (ceil_block) and
# multiplied by TYPE_NUM_EACH_BLOCK (presumably elements per block - confirm).
self.w_tmp_32.set_as(
ceil_block(self.w_tmp, self.image_dtype) * Constant.TYPE_NUM_EACH_BLOCK.get(self.image_dtype))
self.input_image_ub = self.src_ub.reinterpret_cast_to(self.image_dtype)
# Fast path: the padded window (h_tmp * w_tmp_32 * channel elements, measured
# in bytes) is at most 2 * BUFFERLIMIT, so it is copied to UB and sampled with vgather.
with tik_instance.if_scope(self.h_tmp * self.w_tmp_32 * self.channel * Constant.TYPE_LEN_DICT.get(
self.image_dtype) <= 2 * Constant.BUFFERLIMIT):
# Make the integer coordinates relative to the window origin (x_min, y_min).
tik_instance.vec_adds(64, self.x_int32_ub, self.x_int32_ub, -1 * self.x_min_int32, Constant.TILEM // 64,
8, 8)
tik_instance.vec_adds(64, self.y_int32_ub, self.y_int32_ub, -1 * self.y_min_int32, Constant.TILEM // 64,
8, 8)
# Element offset in GM of the window's first pixel for image n.
gm_in_offset1 = tik_instance.Scalar(self.scalar_dtype)
gm_in_offset1.set_as(
n * self.input_height * self.input_width + self.y_min_int32 * self.input_width + self.x_min_int32)
self.gm_in_offset = tik_instance.Scalar(self.scalar_dtype)
self.gm_in_offset.set_as(gm_in_offset1 * self.channel)
# If an image row is a whole number of blocks and the image is at least as wide
# as the padded window, copy all h_tmp rows with one strided data_move (the
# source gap is the rest of the image row); otherwise copy one row per data_move.
with tik_instance.if_scope(
tik.all(
(self.input_width * self.channel) % Constant.TYPE_NUM_EACH_BLOCK.get(self.image_dtype) == 0,
self.input_width >= self.w_tmp_32)):
tik_instance.data_move(self.input_image_ub, self.input_image_gm[self.gm_in_offset], 0, self.h_tmp,
ceil_block(self.w_tmp_32 * self.channel, self.image_dtype),
ceil_block((self.input_width - self.w_tmp_32) * self.channel,
self.image_dtype), 0)
with tik_instance.else_scope():
with tik_instance.for_range(0, self.h_tmp) as h:
tik_instance.data_move(self.input_image_ub[h * self.w_tmp_32 * self.channel],
self.input_image_gm[
self.gm_in_offset + h * self.input_width * self.channel],
0, 1, ceil_block(self.w_tmp_32 * self.channel, self.image_dtype), 0, 0)
# Produce x_fp32_ub / y_fp32_ub from the window-relative integer coordinates
# (vec_conv_dynamic is defined elsewhere; presumably an int32 -> float32 conversion).
vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.x_fp32_ub, self.x_int32_ub)
vec_conv_dynamic(self, 'none', self.bilinear_nthread, self.y_fp32_ub, self.y_int32_ub)
# cmp_ub holds three constant regions: 0 at [0], h_tmp - 1 at [1024], w_tmp - 1 at [2048].
cmp_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(Constant.FLOAT32)
tik_instance.vec_dup(64, cmp_ub, 0, ceil_value(Constant.TILEM, 64), 8)
tik_instance.vec_dup(64, cmp_ub[1024], self.h_tmp - 1, ceil_value(Constant.TILEM, 64), 8)
tik_instance.vec_dup(64, cmp_ub[2 * 1024], self.w_tmp - 1, ceil_value(Constant.TILEM, 64), 8)
# flag = (y >= 0) & (y <= h_tmp - 1) & (x >= 0) & (x <= w_tmp - 1);
# flag_int_ub[64:] is scratch for each new comparison before it is AND-ed in.
repeat_times1 = ceil_value(Constant.TILEM, Constant.TYPE_MASK_DICT.get(Constant.INT32))
tik_instance.vec_cmpv_ge(flag_int_ub, self.y_fp32_ub, cmp_ub, repeat_times1, 8, 8)
tik_instance.vec_cmpv_le(flag_int_ub[64], self.y_fp32_ub, cmp_ub[1024], repeat_times1, 8, 8)
repeat_times2 = 1
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
tik_instance.vec_cmpv_ge(flag_int_ub[64], self.x_fp32_ub, cmp_ub, repeat_times1, 8, 8)
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
tik_instance.vec_cmpv_le(flag_int_ub[64], self.x_fp32_ub, cmp_ub[2 * 1024], repeat_times1, 8, 8)
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
# Index plane 0 = y * (w_tmp_32 * channel) + x * channel, i.e. the element
# position inside the padded window held in input_image_ub.
tmp_ub = self.dst_coord_ub.reinterpret_cast_to(self.scalar_dtype)
tik_instance.vmuls(64, tmp_ub, self.y_int32_ub, self.w_tmp_32 * self.channel, Constant.TILEM // 64, 1,
1, 8, 8)
tik_instance.vmuls(64, tmp_ub[1024], self.x_int32_ub, self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadd(64, gather_index, tmp_ub, tmp_ub[1024], Constant.TILEM // 64, 1, 1, 1, 8, 8, 8)
# Planes at channel*TILEM, 2*channel*TILEM and 3*channel*TILEM: plane 0 + channel,
# plane 0 + one window row, and the latter + channel.
tik_instance.vadds(64, gather_index[self.channel * Constant.TILEM], gather_index, self.channel,
Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadds(64, gather_index[2 * self.channel * Constant.TILEM], gather_index,
self.w_tmp_32 * self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadds(64, gather_index[3 * self.channel * Constant.TILEM],
gather_index[2 * self.channel * Constant.TILEM], self.channel, Constant.TILEM // 64,
1, 1, 8, 8)
# Remaining channels: plane (i * channel + c) = plane (i * channel) + c.
with tik_instance.for_range(1, self.channel) as c:
with tik_instance.for_range(0, 4) as i:
tik_instance.vadds(64, gather_index[((i * self.channel) + c) * Constant.TILEM],
gather_index[i * self.channel * Constant.TILEM], c, Constant.TILEM // 64, 1,
1, 8, 8)
# Plain Python branch on the image dtype. Indices are scaled by 4 (float32) or
# 2 (otherwise) before vgather - presumably element index -> byte offset; confirm
# against the vgather documentation.
if self.image_dtype == Constant.FLOAT32:
vmuls_dynamic(self, 4 * self.channel * Constant.TILEM, gather_index, gather_index, 4)
# Split into two vgather calls when the repeat count exceeds REPEAT_TIMES_MAX.
with tik_instance.if_scope(4 * self.channel * Constant.TILEM // 64 > Constant.REPEAT_TIMES_MAX):
tik_instance.vgather(64, pixel_tmp_ub, self.input_image_ub, gather_index,
Constant.REPEAT_TIMES_MAX, 0, 0, 3)
tik_instance.vgather(64, pixel_tmp_ub[Constant.REPEAT_TIMES_MAX * 64], self.input_image_ub,
gather_index[64 * Constant.REPEAT_TIMES_MAX],
(4 * self.channel * Constant.TILEM // 64) - Constant.REPEAT_TIMES_MAX, 0,
0, 3)
with tik_instance.else_scope():
tik_instance.vgather(64, pixel_tmp_ub, self.input_image_ub, gather_index,
4 * self.channel * Constant.TILEM // 64, 0, 0, 3)
else:
vmuls_dynamic(self, 4 * self.channel * Constant.TILEM, gather_index, gather_index, 2)
tik_instance.vgather(128, pixel_tmp_ub, self.input_image_ub, gather_index,
4 * self.channel * Constant.TILEM // 128, 0, 0, 3)
# Fallback: no window is copied to UB here. Coordinates are checked against the
# full image size, indices address input_image_gm directly, and samples are
# read one element at a time.
with tik_instance.else_scope():
# Same three constant regions as above, but bounded by the full image size.
cmp_ub = self.dst_coord_n1mn0_ub.reinterpret_cast_to(Constant.FLOAT32)
tik_instance.vec_dup(64, cmp_ub, 0, ceil_value(Constant.TILEM, 64), 8)
tik_instance.vec_dup(64, cmp_ub[1024], self.input_height_float32 - 1, ceil_value(Constant.TILEM, 64), 8)
tik_instance.vec_dup(64, cmp_ub[2 * 1024], self.input_width_float32 - 1, ceil_value(Constant.TILEM, 64),
8)
# NOTE(review): the other branch looks up the mask with Constant.INT32 while this
# one uses self.dtype - confirm both yield the same repeat count.
repeat_times1 = ceil_value(Constant.TILEM, Constant.TYPE_MASK_DICT.get(self.dtype))
# flag = (y >= 0) & (y <= input_height - 1) & (x >= 0) & (x <= input_width - 1).
tik_instance.vec_cmpv_ge(flag_int_ub, self.y_fp32_ub, cmp_ub, repeat_times1, 8, 8)
tik_instance.vec_cmpv_le(flag_int_ub[64], self.y_fp32_ub, cmp_ub[1024], repeat_times1, 8, 8)
repeat_times2 = 1
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
tik_instance.vec_cmpv_ge(flag_int_ub[64], self.x_fp32_ub, cmp_ub, repeat_times1, 8, 8)
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
tik_instance.vec_cmpv_le(flag_int_ub[64], self.x_fp32_ub, cmp_ub[2 * 1024], repeat_times1, 8, 8)
tik_instance.vec_and(64, flag_int_ub, flag_int_ub, flag_int_ub[64], repeat_times2, 8, 8, 8)
# Index plane 0 = y * (input_width * channel) + x * channel (row stride of the full image).
tmp_ub = self.dst_coord_ub.reinterpret_cast_to(self.scalar_dtype)
tik_instance.vmuls(64, tmp_ub, self.y_int32_ub, self.input_width * self.channel, Constant.TILEM // 64,
1, 1, 8, 8)
tik_instance.vmuls(64, tmp_ub[1024], self.x_int32_ub, self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadd(64, gather_index, tmp_ub, tmp_ub[1024], Constant.TILEM // 64, 1, 1, 1, 8, 8, 8)
# Other three sample planes: + channel, + one image row, + one image row + channel.
tik_instance.vadds(64, gather_index[self.channel * Constant.TILEM], gather_index, self.channel,
Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadds(64, gather_index[2 * self.channel * Constant.TILEM], gather_index,
self.input_width * self.channel, Constant.TILEM // 64, 1, 1, 8, 8)
tik_instance.vadds(64, gather_index[3 * self.channel * Constant.TILEM],
gather_index[2 * self.channel * Constant.TILEM], self.channel, Constant.TILEM // 64,
1, 1, 8, 8)
# Remaining channels: plane (i * channel + c) = plane (i * channel) + c.
with tik_instance.for_range(1, self.channel) as c:
with tik_instance.for_range(0, 4) as i:
tik_instance.vadds(64, gather_index[((i * self.channel) + c) * Constant.TILEM],
gather_index[i * self.channel * Constant.TILEM], c, Constant.TILEM // 64, 1,
1, 8, 8)
# Add the element offset of image n so every index is absolute within input_image_gm.
tik_instance.vec_adds(64, gather_index, gather_index,
n * self.input_height * self.input_width * self.channel,
4 * self.channel * Constant.TILEM // 64, 8, 8)
# Scalar gather: for each channel / row / column, read the index of each of the
# four sample planes and copy that single element from GM into pixel_tmp_ub.
# NOTE(review): offsets here use cur_h * cur_wup_32 as the plane size, while the
# index planes above were written with a stride of Constant.TILEM; these agree
# only if cur_h * cur_wup_32 == Constant.TILEM - confirm.
offset = tik_instance.Scalar(self.scalar_dtype)
index = tik_instance.Scalar(self.scalar_dtype)
with tik_instance.for_range(0, self.channel) as c:
with tik_instance.for_range(0, self.cur_h) as h:
with tik_instance.for_range(0, self.cur_wup_32) as w:
offset.set_as((c * self.cur_h + h) * self.cur_wup_32 + w)
index.set_as(gather_index[offset])
pixel_tmp_ub[offset].set_as(self.input_image_gm[index])
offset.set_as(((self.channel + c) * self.cur_h + h) * self.cur_wup_32 + w)
index.set_as(gather_index[offset])
pixel_tmp_ub[offset].set_as(self.input_image_gm[index])
offset.set_as(((2 * self.channel + c) * self.cur_h + h) * self.cur_wup_32 + w)
index.set_as(gather_index[offset])
pixel_tmp_ub[offset].set_as(self.input_image_gm[index])
offset.set_as(((3 * self.channel + c) * self.cur_h + h) * self.cur_wup_32 + w)
index.set_as(gather_index[offset])
pixel_tmp_ub[offset].set_as(self.input_image_gm[index])
return pixel_tmp_ub, flag_int_ub
def warp_perspective(image_dtype=Constant.UINT8):
    """
    Build the WarpPerspective operator for one image dtype.

    A uint8 image dtype selects WarpPerspectiveUint8; every other dtype is
    handed to WarpPerspectiveFloat. The result of the selected object's
    warp_perspective_compute() is returned unchanged.
    """
    if image_dtype == Constant.UINT8:
        operator = WarpPerspectiveUint8()
    else:
        operator = WarpPerspectiveFloat(image_dtype)
    return operator.warp_perspective_compute()
def try_remove_kernel_dir(dir_path):
    """
    Best-effort removal of a directory tree.

    Args:
        dir_path: path of the directory to delete; a missing path is a no-op.

    Returns:
        None. A failed removal is never raised to the caller; it is logged as
        a warning together with the path and the underlying error.
    """
    if not os.path.exists(dir_path):
        return
    try:
        shutil.rmtree(dir_path)
    except Exception as err:
        # Deliberately non-fatal: a stale directory must not abort the build,
        # but the cause is logged so a failure can be diagnosed.
        logging.warning("Skip remove dir %s: %s", dir_path, err)
def build_with_retry(dir_path, path, op_name, cons):
    """
    Build one operator kernel, retrying up to 3 times, then back it up.

    Args:
        dir_path: kernel_meta directory; removed after a failed attempt that
            left no output file, so the next attempt starts clean.
        path: output root; the kernel is expected at <path>/kernel_meta/<op_name>.
        op_name: file name of the kernel object.
        cons: image dtype forwarded to warp_perspective.

    Raises:
        RuntimeError: the kernel file does not exist after all attempts. The
            last build error, if any, is attached as the cause.
    """
    out_path = os.path.join(path, 'kernel_meta', op_name)
    retry_times = 3
    last_error = None
    logging.info("start build %s", op_name)
    for retry in range(retry_times):
        try:
            warp_perspective(cons)
        except Exception as err:
            # Retry boundary: keep the error for the final report and log it now.
            last_error = err
            logging.warning("times %s build failed: %s", retry, err)
            if not os.path.exists(out_path):
                try_remove_kernel_dir(dir_path)
        else:
            logging.info("times %s build success.", retry)
            break
    if not os.path.exists(out_path):
        # Format the message here; passing op_name as a second argument would
        # leave the exception carrying an unformatted tuple.
        raise RuntimeError('Build %s failed' % op_name) from last_error
    out_path_bk = os.path.join('/tmp/', op_name)
    try:
        shutil.copy(out_path, out_path_bk)
    except Exception as err:
        # The backup copy is optional; report why it failed and carry on.
        logging.warning("backup operator failed: %s", err)
if __name__ == "__main__":
    # The output root is the first command-line argument.
    path = sys.argv[1]
    dir_path = os.path.join(path, 'kernel_meta')
    # Start from a clean kernel_meta directory, then build one kernel per dtype.
    try_remove_kernel_dir(dir_path)
    for kernel_name, kernel_dtype in (
            ('warp_perspective_uint8.o', Constant.UINT8),
            ('warp_perspective_float32.o', Constant.FLOAT32),
            ('warp_perspective_float16.o', Constant.FLOAT16),
    ):
        build_with_retry(dir_path, path, kernel_name, kernel_dtype)