ModelZoo-PyTorch/ACL_PyTorch/contrib/audio/WeNet/acc.diff-代码预览-ModelZoo-PyTorch:基于昇腾生态的AI模型平台项目 - AtomGit

bbd45fd1创建于 2022年3月24日历史提交
diff --git a/wenet/dataset/dataset.py b/wenet/dataset/dataset.py
index 4f0ff39..4ce97a4 100644
--- a/wenet/dataset/dataset.py
+++ b/wenet/dataset/dataset.py
@@ -27,7 +27,7 @@ import torchaudio.sox_effects as sox_effects
 import yaml
 from PIL import Image
 from PIL.Image import BICUBIC
-from torch.nn.utils.rnn import pad_sequence
+#from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset, DataLoader
 
 import wenet.dataset.kaldi_io as kaldi_io
@@ -36,7 +36,69 @@ from wenet.utils.common import IGNORE_ID
 
 torchaudio.set_audio_backend("sox_io")
 
+def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None):
+    r"""Pad a list of variable length Tensors with ``padding_value``
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension,
+    and pads them to equal length. For example, if the input is list of
+    sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
+    otherwise.
+
+    `B` is batch size. It is equal to the number of elements in ``sequences``.
+    `T` is length of the longest sequence.
+    `L` is length of the sequence.
+    `*` is any number of trailing dimensions, including none.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pad_sequence
+        >>> a = torch.ones(25, 300)
+        >>> b = torch.ones(22, 300)
+        >>> c = torch.ones(15, 300)
+        >>> pad_sequence([a, b, c]).size()
+        torch.Size([25, 3, 300])
+
+    Note:
+        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are same.
+
+    Arguments:
+        sequences (list[Tensor]): list of variable length sequences.
+        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+            ``T x B x *`` otherwise
+        padding_value (float, optional): value for padded elements. Default: 0.
 
+    Returns:
+        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+        Tensor of size ``B x T x *`` otherwise
+    """
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are same and fetching those from sequences[0]
+
+    max_size = sequences[0].size()
+    trailing_dims = max_size[1:]
+
+    max_len = max([s.size(0) for s in sequences])
+    if mul_shape is not None:
+        for in_shape in mul_shape:
+            if max_len < in_shape:
+                max_len = in_shape
+                break
+    if batch_first:
+        out_dims = (len(sequences), max_len) + trailing_dims
+    else:
+        out_dims = (max_len, len(sequences)) + trailing_dims
+
+    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
+    for i, tensor in enumerate(sequences):
+        length = tensor.size(0)
+        # use index notation to prevent duplicate references to the tensor
+        if batch_first:
+            out_tensor[i, :length, ...] = tensor
+        else:
+            out_tensor[:length, i, ...] = tensor
+    return out_tensor
 def _spec_augmentation(x,
                        warp_for_time=False,
                        num_t_mask=2,
@@ -187,6 +249,7 @@ def _extract_feature(batch, speed_perturb, wav_distortion_conf,
     Returns:
         (keys, feats, labels)
     """
+
     keys = []
     feats = []
     lengths = []
@@ -331,13 +394,14 @@ class CollateFunc(object):
         self.spec_sub = spec_sub
         self.spec_sub_conf = spec_sub_conf
 
+
+
     def __call__(self, batch):
         assert (len(batch) == 1)
         if self.raw_wav:
             keys, xs, ys = _extract_feature(batch[0], self.speed_perturb,
                                             self.wav_distortion_conf,
                                             self.feature_extraction_conf)
-
         else:
             keys, xs, ys = _load_feature(batch[0])
 
@@ -359,27 +423,31 @@ class CollateFunc(object):
         if self.spec_aug:
             xs = [_spec_augmentation(x, **self.spec_aug_conf) for x in xs]
 
-        # padding
-        xs_lengths = torch.from_numpy(
-            np.array([x.shape[0] for x in xs], dtype=np.int32))
+
 
         # pad_sequence will FAIL in case xs is empty
+        mul_shape = [262, 326, 390, 454, 518, 582, 646, 710, 774, 838, 902, 966, 1028, 1284, 1478]
         if len(xs) > 0:
-            xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs],
-                                  True, 0)
+            xs_pad = _pad_sequence([torch.from_numpy(x).float() for x in xs],
+                                  True, 0, mul_shape)
         else:
             xs_pad = torch.Tensor(xs)
+        # padding
+        xs_lengths = torch.from_numpy(
+            np.array([x.shape[0] for x in xs_pad], dtype=np.int32))
+
         if train_flag:
             ys_lengths = torch.from_numpy(
                 np.array([y.shape[0] for y in ys], dtype=np.int32))
             if len(ys) > 0:
-                ys_pad = pad_sequence([torch.from_numpy(y).int() for y in ys],
+                ys_pad = _pad_sequence([torch.from_numpy(y).int() for y in ys],
                                       True, IGNORE_ID)
             else:
                 ys_pad = torch.Tensor(ys)
         else:
             ys_pad = None
             ys_lengths = None
+
         return keys, xs_pad, ys_pad, xs_lengths, ys_lengths
 
 
@@ -430,7 +498,6 @@ class AudioDataset(Dataset):
         """
         assert batch_type in ['static', 'dynamic']
         data = []
-
         # Open in utf8 mode since meet encoding problem
         with codecs.open(data_file, 'r', encoding='utf-8') as f:
             for line in f:
diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py
index 73990fa..50358ca 100644
--- a/wenet/transformer/asr_model.py
+++ b/wenet/transformer/asr_model.py
@@ -32,8 +32,74 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add,
                                 reverse_pad_list)
 from wenet.utils.mask import (make_pad_mask, mask_finished_preds,
                               mask_finished_scores, subsequent_mask)
+from wenet.transformer.acl_net import Net
+import time
+import acl
+
+def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None):
+    r"""Pad a list of variable length Tensors with ``padding_value``
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension,
+    and pads them to equal length. For example, if the input is list of
+    sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
+    otherwise.
+
+    `B` is batch size. It is equal to the number of elements in ``sequences``.
+    `T` is length of the longest sequence.
+    `L` is length of the sequence.
+    `*` is any number of trailing dimensions, including none.
+
+    Example:
+        >>> from torch.nn.utils.rnn import pad_sequence
+        >>> a = torch.ones(25, 300)
+        >>> b = torch.ones(22, 300)
+        >>> c = torch.ones(15, 300)
+        >>> pad_sequence([a, b, c]).size()
+        torch.Size([25, 3, 300])
+
+    Note:
+        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are same.
+
+    Arguments:
+        sequences (list[Tensor]): list of variable length sequences.
+        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+            ``T x B x *`` otherwise
+        padding_value (float, optional): value for padded elements. Default: 0.
+
+    Returns:
+        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+        Tensor of size ``B x T x *`` otherwise
+    """
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are same and fetching those from sequences[0]
+
+    max_size = sequences[0].size()
+    trailing_dims = max_size[1:]
+
+    max_len = max([s.size(0) for s in sequences])
+    if mul_shape is not None:
+        for in_shape in mul_shape:
+            if max_len < in_shape:
+                max_len = in_shape
+                break
 
-
+    if batch_first:
+        out_dims = (len(sequences), max_len) + trailing_dims
+    else:
+        out_dims = (max_len, len(sequences)) + trailing_dims
+
+    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
+    for i, tensor in enumerate(sequences):
+        length = tensor.size(0)
+        # use index notation to prevent duplicate references to the tensor
+        if batch_first:
+            out_tensor[i, :length, ...] = tensor
+        else:
+            out_tensor[:length, i, ...] = tensor
+    return out_tensor
 class ASRModel(torch.nn.Module):
     """CTC-attention hybrid Encoder-Decoder model"""
     def __init__(
@@ -60,6 +126,13 @@ class ASRModel(torch.nn.Module):
         self.reverse_weight = reverse_weight
 
         self.encoder = encoder
+        self.device_id = 0
+        ret = acl.init()
+        ret = acl.rt.set_device(self.device_id)
+        context, ret = acl.rt.create_context(self.device_id)
+        self.encoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/encoder_fendang_262_1478_static.om", device_id=self.device_id)
+        self.decoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/decoder_fendang.om", device_id=self.device_id)
+        self.encoder_out_shape = []
         self.decoder = decoder
         self.ctc = ctc
         self.criterion_att = LabelSmoothingLoss(
@@ -168,13 +241,21 @@ class ASRModel(torch.nn.Module):
                 num_decoding_left_chunks=num_decoding_left_chunks
             )  # (B, maxlen, encoder_dim)
         else:
-            encoder_out, encoder_mask = self.encoder(
-                speech,
-                speech_lengths,
-                decoding_chunk_size=decoding_chunk_size,
-                num_decoding_left_chunks=num_decoding_left_chunks
-            )  # (B, maxlen, encoder_dim)
-        return encoder_out, encoder_mask
+            st = time.time()
+
+            # encoder_out, encoder_mask = self.encoder(
+            #     speech,
+            #     speech_lengths,
+            #     decoding_chunk_size=decoding_chunk_size,
+            #     num_decoding_left_chunks=num_decoding_left_chunks
+            # )  # (B, maxlen, encoder_dim)
+            speech = speech.numpy()
+            speech_lengths = speech_lengths.numpy().astype("int32")
+            dims1 = {'dimCount': 4, 'name': '', 'dims': [1, speech.shape[1], 80, 1]}
+            y, exe_time = self.encoder_ascend([speech, speech_lengths], dims = dims1)
+            encoder_out = torch.from_numpy(y[0])
+            encoder_mask = torch.from_numpy(y[1])
+        return encoder_out, encoder_mask, exe_time
 
     def recognize(
         self,
@@ -361,13 +442,17 @@ class ASRModel(torch.nn.Module):
         assert batch_size == 1
         # Let's assume B = batch_size and N = beam_size
         # 1. Encoder forward and get CTC score
-        encoder_out, encoder_mask = self._forward_encoder(
+        encoder_out, encoder_mask, encoder_t = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
         maxlen = encoder_out.size(1)
+        mul_shape = [96, 144, 384]
+
+        encoder_out = _pad_sequence(encoder_out, True, 0, mul_shape)
         ctc_probs = self.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)
+
         ctc_probs = ctc_probs.squeeze(0)
         # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
         cur_hyps = [(tuple(), (0.0, -float('inf')))]
@@ -409,7 +494,7 @@ class ASRModel(torch.nn.Module):
                                reverse=True)
             cur_hyps = next_hyps[:beam_size]
         hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
-        return hyps, encoder_out
+        return hyps, encoder_out, encoder_t
 
     def ctc_prefix_beam_search(
         self,
@@ -485,7 +570,7 @@ class ASRModel(torch.nn.Module):
         # For attention rescoring we only support batch_size=1
         assert batch_size == 1
         # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size
-        hyps, encoder_out = self._ctc_prefix_beam_search(
+        hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search(
             speech, speech_lengths, beam_size, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
 
@@ -510,9 +595,19 @@ class ASRModel(torch.nn.Module):
         r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id)
         r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos,
                                     self.ignore_id)
-        decoder_out, r_decoder_out, _ = self.decoder(
-            encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad,
-            reverse_weight)  # (beam_size, max_hyps_len, vocab_size)
+
+        encoder_out = encoder_out.numpy()
+        encoder_mask = encoder_mask.numpy()
+        hyps_pad = hyps_pad.numpy()
+        hyps_lens = hyps_lens.numpy().astype("int32")
+        r_hyps_pad = r_hyps_pad.numpy()
+        dims2 = {'dimCount': 11, 'name': '', 'dims': [10, encoder_out.shape[1], 256, 10, 1, encoder_out.shape[1], 10, r_hyps_pad.shape[1], 10, 10, r_hyps_pad.shape[1]]}
+
+        y, exe_time = self.decoder_ascend([encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad], dims=dims2)
+        batch_t = encoder_t + exe_time
+        decoder_out = torch.from_numpy(y[0])
+        r_decoder_out = torch.from_numpy(y[1])
+
         decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1)
         decoder_out = decoder_out.cpu().numpy()
         # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
@@ -539,7 +634,7 @@ class ASRModel(torch.nn.Module):
             if score > best_score:
                 best_score = score
                 best_index = i
-        return hyps[best_index][0]
+        return hyps[best_index][0], batch_t
 
     @torch.jit.export
     def subsampling_rate(self) -> int:
diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py
index e342ed4..c8e18d5 100644
--- a/wenet/transformer/encoder.py
+++ b/wenet/transformer/encoder.py
@@ -157,6 +157,8 @@ class BaseEncoder(torch.nn.Module):
                                               decoding_chunk_size,
                                               self.static_chunk_size,
                                               num_decoding_left_chunks)
+
+
         for layer in self.encoders:
             xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
         if self.normalize_before: