import os
import json
import torchaudio
import torch.nn.functional as F
from speechbrain.pretrained import EncoderClassifier
from mini_librispeech_prepare import prepare_mini_librispeech
# Preprocessing script: extracts classifier input features for every utterance
# listed in test.json, pads them along dim 1, dumps each tensor as a raw
# binary file, and records "<index> <path> (<d0>,<d1>,<d2>)" per utterance in
# an info file.

DATA_FOLDER = 'data'
TEST_JSON = 'test.json'
OUTPUT_DIR = 'mini_librispeech_test_bin'
INFO_FILE = 'mini_librispeech_test.info'
# Dim 1 of the features is padded up to a multiple of this value.
PAD_MULTIPLE = 100

# split_ratio=[0, 0, 100] is passed through to the preparation helper;
# presumably it assigns every utterance to the test split -- confirm against
# mini_librispeech_prepare.
prepare_mini_librispeech(
    data_folder=DATA_FOLDER,
    save_json_train='train.json',
    save_json_valid='valid.json',
    save_json_test=TEST_JSON,
    split_ratio=[0, 0, 100],
)

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(OUTPUT_DIR, exist_ok=True)

classifier = EncoderClassifier.from_hparams(source='best_model', savedir='best_model')

with open(TEST_JSON, 'r') as f:
    data_info = json.load(f)

# The info file is opened only after the model and manifest loaded, and inside
# a context manager so it is flushed and closed even if an utterance fails.
with open(INFO_FILE, 'w') as info_file:
    for i, value in enumerate(data_info.values()):
        # Drop the first 11 characters of the manifest path and re-root it
        # under the data folder. NOTE(review): 11 is the length of a
        # "{data_root}" placeholder -- confirm that is what the manifest uses.
        wav_file = DATA_FOLDER + value['wav'][11:]
        signal, fs = torchaudio.load(wav_file)
        feats = classifier.extract_feats(signal)

        # Pad the end of dim 1 with zeros up to the next multiple of
        # PAD_MULTIPLE that is strictly greater than the current length.
        # A length that is already an exact multiple therefore still gains a
        # full extra PAD_MULTIPLE entries; this matches the original
        # behaviour and is kept on purpose.
        length = feats.shape[1]
        pad = (length // PAD_MULTIPLE + 1) * PAD_MULTIPLE - length
        # F.pad takes (left, right) pairs starting from the last dimension,
        # so only dim 1 (right side) of the 3-D tensor is extended.
        feats = F.pad(feats, (0, 0, 0, pad, 0, 0), value=0)

        # Derive "<stem>.bin" from the source file name; splitext works for
        # any extension length (the old [:-4] slice assumed ".flac").
        stem = os.path.splitext(value['wav'].split('/')[-1])[0]
        output = OUTPUT_DIR + '/' + stem + '.bin'
        feats.numpy().tofile(output)

        info_file.write(
            f'{i} {output} ({feats.shape[0]},{feats.shape[1]},{feats.shape[2]})\n'
        )

print('data preprocess done')