@@ -27,7 +27,7 @@ import torchaudio.sox_effects as sox_effects
import yaml
from PIL import Image
from PIL.Image import BICUBIC
-from torch.nn.utils.rnn import pad_sequence
+#from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import wenet.dataset.kaldi_io as kaldi_io
@@ -36,7 +36,69 @@ from wenet.utils.common import IGNORE_ID
torchaudio.set_audio_backend("sox_io")
+def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None):
+ r"""Pad a list of variable length Tensors with ``padding_value``
+
+ ``pad_sequence`` stacks a list of Tensors along a new dimension,
+ and pads them to equal length. For example, if the input is list of
+ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
+ otherwise.
+
+ `B` is batch size. It is equal to the number of elements in ``sequences``.
+ `T` is length of the longest sequence.
+ `L` is length of the sequence.
+ `*` is any number of trailing dimensions, including none.
+
+ Example:
+ >>> from torch.nn.utils.rnn import pad_sequence
+ >>> a = torch.ones(25, 300)
+ >>> b = torch.ones(22, 300)
+ >>> c = torch.ones(15, 300)
+ >>> pad_sequence([a, b, c]).size()
+ torch.Size([25, 3, 300])
+
+ Note:
+ This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+ where `T` is the length of the longest sequence. This function assumes
+ trailing dimensions and type of all the Tensors in sequences are same.
+
+ Arguments:
+ sequences (list[Tensor]): list of variable length sequences.
+ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+ ``T x B x *`` otherwise
+ padding_value (float, optional): value for padded elements. Default: 0.
+ Returns:
+ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+ Tensor of size ``B x T x *`` otherwise
+ """
+
+ # assuming trailing dimensions and type of all the Tensors
+ # in sequences are same and fetching those from sequences[0]
+
+ max_size = sequences[0].size()
+ trailing_dims = max_size[1:]
+
+ max_len = max([s.size(0) for s in sequences])
+ if mul_shape is not None:
+ for in_shape in mul_shape:
+ if max_len < in_shape:
+ max_len = in_shape
+ break
+ if batch_first:
+ out_dims = (len(sequences), max_len) + trailing_dims
+ else:
+ out_dims = (max_len, len(sequences)) + trailing_dims
+
+ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
+ for i, tensor in enumerate(sequences):
+ length = tensor.size(0)
+ # use index notation to prevent duplicate references to the tensor
+ if batch_first:
+ out_tensor[i, :length, ...] = tensor
+ else:
+ out_tensor[:length, i, ...] = tensor
+ return out_tensor
def _spec_augmentation(x,
warp_for_time=False,
num_t_mask=2,
@@ -187,6 +249,7 @@ def _extract_feature(batch, speed_perturb, wav_distortion_conf,
Returns:
(keys, feats, labels)
"""
+
keys = []
feats = []
lengths = []
@@ -331,13 +394,14 @@ class CollateFunc(object):
self.spec_sub = spec_sub
self.spec_sub_conf = spec_sub_conf
+
+
def __call__(self, batch):
assert (len(batch) == 1)
if self.raw_wav:
keys, xs, ys = _extract_feature(batch[0], self.speed_perturb,
self.wav_distortion_conf,
self.feature_extraction_conf)
-
else:
keys, xs, ys = _load_feature(batch[0])
@@ -359,27 +423,31 @@ class CollateFunc(object):
if self.spec_aug:
xs = [_spec_augmentation(x, **self.spec_aug_conf) for x in xs]
- # padding
- xs_lengths = torch.from_numpy(
- np.array([x.shape[0] for x in xs], dtype=np.int32))
+
# pad_sequence will FAIL in case xs is empty
+ mul_shape = [262, 326, 390, 454, 518, 582, 646, 710, 774, 838, 902, 966, 1028, 1284, 1478]
if len(xs) > 0:
- xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs],
- True, 0)
+ xs_pad = _pad_sequence([torch.from_numpy(x).float() for x in xs],
+ True, 0, mul_shape)
else:
xs_pad = torch.Tensor(xs)
+ # padding
+ xs_lengths = torch.from_numpy(
+ np.array([x.shape[0] for x in xs_pad], dtype=np.int32))
+
if train_flag:
ys_lengths = torch.from_numpy(
np.array([y.shape[0] for y in ys], dtype=np.int32))
if len(ys) > 0:
- ys_pad = pad_sequence([torch.from_numpy(y).int() for y in ys],
+ ys_pad = _pad_sequence([torch.from_numpy(y).int() for y in ys],
True, IGNORE_ID)
else:
ys_pad = torch.Tensor(ys)
else:
ys_pad = None
ys_lengths = None
+
return keys, xs_pad, ys_pad, xs_lengths, ys_lengths
@@ -430,7 +498,6 @@ class AudioDataset(Dataset):
"""
assert batch_type in ['static', 'dynamic']
data = []
-
# Open in utf8 mode since meet encoding problem
with codecs.open(data_file, 'r', encoding='utf-8') as f:
for line in f:
@@ -32,8 +32,74 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add,
reverse_pad_list)
from wenet.utils.mask import (make_pad_mask, mask_finished_preds,
mask_finished_scores, subsequent_mask)
+from wenet.transformer.acl_net import Net
+import time
+import acl
+
+def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None):
+ r"""Pad a list of variable length Tensors with ``padding_value``
+
+ ``pad_sequence`` stacks a list of Tensors along a new dimension,
+ and pads them to equal length. For example, if the input is list of
+ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
+ otherwise.
+
+ `B` is batch size. It is equal to the number of elements in ``sequences``.
+ `T` is length of the longest sequence.
+ `L` is length of the sequence.
+ `*` is any number of trailing dimensions, including none.
+
+ Example:
+ >>> from torch.nn.utils.rnn import pad_sequence
+ >>> a = torch.ones(25, 300)
+ >>> b = torch.ones(22, 300)
+ >>> c = torch.ones(15, 300)
+ >>> pad_sequence([a, b, c]).size()
+ torch.Size([25, 3, 300])
+
+ Note:
+ This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+ where `T` is the length of the longest sequence. This function assumes
+ trailing dimensions and type of all the Tensors in sequences are same.
+
+ Arguments:
+ sequences (list[Tensor]): list of variable length sequences.
+ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+ ``T x B x *`` otherwise
+ padding_value (float, optional): value for padded elements. Default: 0.
+
+ Returns:
+ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+ Tensor of size ``B x T x *`` otherwise
+ """
+
+ # assuming trailing dimensions and type of all the Tensors
+ # in sequences are same and fetching those from sequences[0]
+
+ max_size = sequences[0].size()
+ trailing_dims = max_size[1:]
+
+ max_len = max([s.size(0) for s in sequences])
+ if mul_shape is not None:
+ for in_shape in mul_shape:
+ if max_len < in_shape:
+ max_len = in_shape
+ break
-
+ if batch_first:
+ out_dims = (len(sequences), max_len) + trailing_dims
+ else:
+ out_dims = (max_len, len(sequences)) + trailing_dims
+
+ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
+ for i, tensor in enumerate(sequences):
+ length = tensor.size(0)
+ # use index notation to prevent duplicate references to the tensor
+ if batch_first:
+ out_tensor[i, :length, ...] = tensor
+ else:
+ out_tensor[:length, i, ...] = tensor
+ return out_tensor
class ASRModel(torch.nn.Module):
"""CTC-attention hybrid Encoder-Decoder model"""
def __init__(
@@ -60,6 +126,13 @@ class ASRModel(torch.nn.Module):
self.reverse_weight = reverse_weight
self.encoder = encoder
+ self.device_id = 0
+ ret = acl.init()
+ ret = acl.rt.set_device(self.device_id)
+ context, ret = acl.rt.create_context(self.device_id)
+ self.encoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/encoder_fendang_262_1478_static.om", device_id=self.device_id)
+ self.decoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/decoder_fendang.om", device_id=self.device_id)
+ self.encoder_out_shape = []
self.decoder = decoder
self.ctc = ctc
self.criterion_att = LabelSmoothingLoss(
@@ -168,13 +241,21 @@ class ASRModel(torch.nn.Module):
num_decoding_left_chunks=num_decoding_left_chunks
) # (B, maxlen, encoder_dim)
else:
- encoder_out, encoder_mask = self.encoder(
- speech,
- speech_lengths,
- decoding_chunk_size=decoding_chunk_size,
- num_decoding_left_chunks=num_decoding_left_chunks
- ) # (B, maxlen, encoder_dim)
- return encoder_out, encoder_mask
+ st = time.time()
+
+ # encoder_out, encoder_mask = self.encoder(
+ # speech,
+ # speech_lengths,
+ # decoding_chunk_size=decoding_chunk_size,
+ # num_decoding_left_chunks=num_decoding_left_chunks
+ # ) # (B, maxlen, encoder_dim)
+ speech = speech.numpy()
+ speech_lengths = speech_lengths.numpy().astype("int32")
+ dims1 = {'dimCount': 4, 'name': '', 'dims': [1, speech.shape[1], 80, 1]}
+ y, exe_time = self.encoder_ascend([speech, speech_lengths], dims = dims1)
+ encoder_out = torch.from_numpy(y[0])
+ encoder_mask = torch.from_numpy(y[1])
+ return encoder_out, encoder_mask, exe_time
def recognize(
self,
@@ -361,13 +442,17 @@ class ASRModel(torch.nn.Module):
assert batch_size == 1
# Let's assume B = batch_size and N = beam_size
# 1. Encoder forward and get CTC score
- encoder_out, encoder_mask = self._forward_encoder(
+ encoder_out, encoder_mask, encoder_t = self._forward_encoder(
speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1)
+ mul_shape = [96, 144, 384]
+
+ encoder_out = _pad_sequence(encoder_out, True, 0, mul_shape)
ctc_probs = self.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size)
+
ctc_probs = ctc_probs.squeeze(0)
# cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
cur_hyps = [(tuple(), (0.0, -float('inf')))]
@@ -409,7 +494,7 @@ class ASRModel(torch.nn.Module):
reverse=True)
cur_hyps = next_hyps[:beam_size]
hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
- return hyps, encoder_out
+ return hyps, encoder_out, encoder_t
def ctc_prefix_beam_search(
self,
@@ -485,7 +570,7 @@ class ASRModel(torch.nn.Module):
# For attention rescoring we only support batch_size=1
assert batch_size == 1
# encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size
- hyps, encoder_out = self._ctc_prefix_beam_search(
+ hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search(
speech, speech_lengths, beam_size, decoding_chunk_size,
num_decoding_left_chunks, simulate_streaming)
@@ -510,9 +595,19 @@ class ASRModel(torch.nn.Module):
r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id)
r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos,
self.ignore_id)
- decoder_out, r_decoder_out, _ = self.decoder(
- encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad,
- reverse_weight) # (beam_size, max_hyps_len, vocab_size)
+
+ encoder_out = encoder_out.numpy()
+ encoder_mask = encoder_mask.numpy()
+ hyps_pad = hyps_pad.numpy()
+ hyps_lens = hyps_lens.numpy().astype("int32")
+ r_hyps_pad = r_hyps_pad.numpy()
+ dims2 = {'dimCount': 11, 'name': '', 'dims': [10, encoder_out.shape[1], 256, 10, 1, encoder_out.shape[1], 10, r_hyps_pad.shape[1], 10, 10, r_hyps_pad.shape[1]]}
+
+ y, exe_time = self.decoder_ascend([encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad], dims=dims2)
+ batch_t = encoder_t + exe_time
+ decoder_out = torch.from_numpy(y[0])
+ r_decoder_out = torch.from_numpy(y[1])
+
decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1)
decoder_out = decoder_out.cpu().numpy()
# r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
@@ -539,7 +634,7 @@ class ASRModel(torch.nn.Module):
if score > best_score:
best_score = score
best_index = i
- return hyps[best_index][0]
+ return hyps[best_index][0], batch_t
@torch.jit.export
def subsampling_rate(self) -> int:
@@ -157,6 +157,8 @@ class BaseEncoder(torch.nn.Module):
decoding_chunk_size,
self.static_chunk_size,
num_decoding_left_chunks)
+
+
for layer in self.encoders:
xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
if self.normalize_before: