#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "paddle_api.h"
using namespace paddle::lite_api;
/**
 * Abstract interface of a two-model (acoustic model + vocoder) speech
 * predictor. Concrete behaviour is supplied by implementations such as
 * Predictor<WavDataType>.
 *
 * The destructor is defaulted inside the class instead of being declared pure
 * and defined out of line: a non-inline out-of-class definition placed in a
 * file that is included from several translation units produces duplicate
 * symbols at link time. The class stays abstract because of its other pure
 * virtual members.
 */
class PredictorInterface {
 public:
  virtual ~PredictorInterface() = default;

  /// Loads both models and stores the sample rate; returns false on failure.
  virtual bool Init(const std::string &AcousticModelPath,
                    const std::string &VocoderPath,
                    PowerMode cpuPowerMode,
                    int cpuThreadNum,
                    uint32_t wavSampleRate) = 0;

  /// Creates a predictor for the model stored at `modelPath`.
  virtual std::shared_ptr<PaddlePredictor> LoadModel(
      const std::string &modelPath,
      int cpuThreadNum,
      PowerMode cpuPowerMode) = 0;

  /// Drops the loaded models.
  virtual void ReleaseModel() = 0;

  /// Runs the whole pipeline on `phones`; returns false if it could not run.
  virtual bool RunModel(const std::vector<int64_t> &phones) = 0;

  /// Runs the acoustic model on `phones` and returns its output tensor.
  virtual std::unique_ptr<const Tensor> GetAcousticModelOutput(
      const std::vector<int64_t> &phones) = 0;

  /// Runs the vocoder on the acoustic model output and returns its output.
  virtual std::unique_ptr<const Tensor> GetVocoderOutput(
      std::unique_ptr<const Tensor> &&amOutput) = 0;

  /// Converts the vocoder output tensor into the stored waveform.
  virtual void VocoderOutputToWav(
      std::unique_ptr<const Tensor> &&vocOutput) = 0;

  /// Stores `size` float samples starting at `floatWav` as the waveform.
  virtual void SaveFloatWav(float *floatWav, int64_t size) = 0;

  /// True when the models needed by RunModel() are available.
  virtual bool IsLoaded() = 0;

  virtual float GetInferenceTime() = 0;
  virtual int GetWavSize() = 0;
  virtual float GetWavDuration() = 0;
  virtual float GetRTF() = 0;

  /// Discards the stored waveform.
  virtual void ReleaseWav() = 0;

  /// Writes the stored waveform to `wavPath`; returns false on failure.
  virtual bool WriteWavToFile(const std::string &wavPath) = 0;
};
/**
 * Two-stage predictor: phone ids are fed to an acoustic model, the acoustic
 * model output is fed to a vocoder, and the vocoder output is stored as a
 * waveform of WavDataType samples that can be written out as a WAV file.
 *
 * SaveFloatWav() and GetWavAudioFormat() have no generic definition; they are
 * provided by explicit specializations outside the class (int16_t and float).
 */
template <typename WavDataType>
class Predictor : public PredictorInterface {
 public:
  /**
   * Releases any previously loaded models, then loads the acoustic model and
   * the vocoder.
   *
   * @return true when both models were created. On failure no model is kept
   *         loaded and the stored sample rate is left unchanged.
   */
  bool Init(const std::string &AcousticModelPath,
            const std::string &VocoderPath,
            PowerMode cpuPowerMode,
            int cpuThreadNum,
            uint32_t wavSampleRate) override {
    ReleaseModel();

    acoustic_model_predictor_ =
        LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
    if (acoustic_model_predictor_ == nullptr) {
      return false;
    }

    vocoder_predictor_ = LoadModel(VocoderPath, cpuThreadNum, cpuPowerMode);
    if (vocoder_predictor_ == nullptr) {
      // Do not keep a half-initialised pipeline around: the acoustic model
      // alone cannot be used (IsLoaded() requires both), so free it too.
      ReleaseModel();
      return false;
    }

    wav_sample_rate_ = wavSampleRate;
    return true;
  }

  // While a destructor runs, virtual calls resolve to this class's own
  // overrides, so these calls never reach an already-destroyed subclass.
  ~Predictor() override {
    ReleaseModel();
    ReleaseWav();
  }

  /**
   * Builds a MobileConfig for `modelPath` with the given thread count and
   * power mode and creates a predictor from it.
   *
   * @return nullptr when `modelPath` is empty, otherwise the result of
   *         CreatePaddlePredictor.
   */
  std::shared_ptr<PaddlePredictor> LoadModel(
      const std::string &modelPath,
      int cpuThreadNum,
      PowerMode cpuPowerMode) override {
    if (modelPath.empty()) {
      return nullptr;
    }
    MobileConfig config;
    config.set_model_from_file(modelPath);
    config.set_threads(cpuThreadNum);
    config.set_power_mode(cpuPowerMode);
    return CreatePaddlePredictor<MobileConfig>(config);
  }

  /// Drops this object's references to both predictors.
  void ReleaseModel() override {
    acoustic_model_predictor_ = nullptr;
    vocoder_predictor_ = nullptr;
  }

  /**
   * Runs acoustic model -> vocoder -> waveform conversion on `phones` and
   * records the elapsed wall time in milliseconds.
   *
   * @return false (without running anything) when the models are not loaded.
   */
  bool RunModel(const std::vector<int64_t> &phones) override {
    if (!IsLoaded()) {
      return false;
    }
    // steady_clock is monotonic; system_clock may jump when the wall clock is
    // adjusted, which would corrupt the measured interval.
    const auto start = std::chrono::steady_clock::now();
    VocoderOutputToWav(GetVocoderOutput(GetAcousticModelOutput(phones)));
    const auto end = std::chrono::steady_clock::now();
    inference_time_ =
        std::chrono::duration<float, std::milli>(end - start).count();
    return true;
  }

  /**
   * Copies `phones` into input 0 of the acoustic model as a 1-D tensor, runs
   * the model, prints the shape of output 0 and returns that output.
   * Requires the acoustic model to be loaded.
   */
  std::unique_ptr<const Tensor> GetAcousticModelOutput(
      const std::vector<int64_t> &phones) override {
    auto phones_handle = acoustic_model_predictor_->GetInput(0);
    phones_handle->Resize({static_cast<int64_t>(phones.size())});
    phones_handle->CopyFromCpu(phones.data());
    acoustic_model_predictor_->Run();

    auto am_output_handle = acoustic_model_predictor_->GetOutput(0);
    PrintShape("Acoustic Model Output shape: ", am_output_handle->shape());
    return am_output_handle;
  }

  /**
   * Copies the float data of `amOutput` (keeping its shape) into input 0 of
   * the vocoder, runs the vocoder, prints the shape of output 0 and returns
   * that output. Requires the vocoder to be loaded.
   */
  std::unique_ptr<const Tensor> GetVocoderOutput(
      std::unique_ptr<const Tensor> &&amOutput) override {
    auto mel_handle = vocoder_predictor_->GetInput(0);
    mel_handle->Resize(amOutput->shape());
    mel_handle->CopyFromCpu(amOutput->mutable_data<float>());
    vocoder_predictor_->Run();

    auto voc_output_handle = vocoder_predictor_->GetOutput(0);
    PrintShape("Vocoder Output shape: ", voc_output_handle->shape());
    return voc_output_handle;
  }

  /**
   * Treats `vocOutput` as a flat float buffer whose length is the product of
   * its dimensions and hands it to SaveFloatWav().
   */
  void VocoderOutputToWav(
      std::unique_ptr<const Tensor> &&vocOutput) override {
    int64_t output_size = 1;
    for (auto dim : vocOutput->shape()) {
      output_size *= dim;
    }
    SaveFloatWav(vocOutput->mutable_data<float>(), output_size);
  }

  /// Defined only through explicit specializations outside the class.
  void SaveFloatWav(float *floatWav, int64_t size) override;

  /// True when both the acoustic model and the vocoder are loaded.
  bool IsLoaded() override {
    return acoustic_model_predictor_ != nullptr &&
           vocoder_predictor_ != nullptr;
  }

  /// Duration of the last successful RunModel() call, in milliseconds.
  float GetInferenceTime() override { return inference_time_; }

  /// Read-only access to the stored samples.
  const std::vector<WavDataType> &GetWav() { return wav_; }

  /// Size of the stored waveform in bytes.
  int GetWavSize() override {
    return static_cast<int>(wav_.size() * sizeof(WavDataType));
  }

  /**
   * Length of the stored waveform in milliseconds
   * (sample count / sample rate * 1000).
   *
   * @return 0 when no sample rate has been set, instead of dividing by zero.
   */
  float GetWavDuration() override {
    if (wav_sample_rate_ == 0) {
      return 0.0f;
    }
    return static_cast<float>(wav_.size()) /
           static_cast<float>(wav_sample_rate_) * 1000.0f;
  }

  /**
   * Ratio of inference time to waveform duration.
   *
   * @return 0 when the waveform duration is zero, instead of inf/NaN.
   */
  float GetRTF() override {
    const float duration_ms = GetWavDuration();
    if (duration_ms <= 0.0f) {
      return 0.0f;
    }
    return GetInferenceTime() / duration_ms;
  }

  /// Discards the stored samples.
  void ReleaseWav() override { wav_.clear(); }

  /**
   * Writes a 44-byte WAV header followed by the stored samples to `wavPath`.
   *
   * @return false when the file cannot be opened or any write fails.
   */
  bool WriteWavToFile(const std::string &wavPath) override {
    // The header is written as a raw memory image, so its layout must match
    // the 44-byte canonical WAV header exactly (no padding).
    static_assert(sizeof(WavHeader) == 44,
                  "WavHeader must be exactly 44 bytes");

    std::ofstream fout(wavPath, std::ios::binary);
    if (!fout.is_open()) {
      return false;
    }

    WavHeader header;
    header.audio_format = GetWavAudioFormat();
    header.data_size = static_cast<uint32_t>(GetWavSize());
    // RIFF chunk size excludes the 8 bytes of the "RIFF" tag and this field.
    header.size =
        static_cast<uint32_t>(sizeof(header) - 8 + header.data_size);
    header.sample_rate = wav_sample_rate_;
    header.byte_rate = header.sample_rate * header.num_channels *
                       header.bits_per_sample / 8;
    header.block_align = static_cast<uint16_t>(
        header.num_channels * header.bits_per_sample / 8);

    // NOTE(review): header fields and samples are written in host byte
    // order; the RIFF format is little-endian.
    fout.write(reinterpret_cast<const char *>(&header), sizeof(header));
    fout.write(reinterpret_cast<const char *>(wav_.data()),
               header.data_size);
    fout.close();
    // Report write/close errors (e.g. disk full) instead of claiming success.
    return !fout.fail();
  }

 protected:
  /// In-memory image of the WAV file header written by WriteWavToFile().
  struct WavHeader {
    char riff[4] = {'R', 'I', 'F', 'F'};
    uint32_t size = 0;
    char wave[4] = {'W', 'A', 'V', 'E'};
    char fmt[4] = {'f', 'm', 't', ' '};
    uint32_t fmt_size = 16;
    uint16_t audio_format = 0;
    uint16_t num_channels = 1;
    uint32_t sample_rate = 0;
    uint32_t byte_rate = 0;
    uint16_t block_align = 0;
    uint16_t bits_per_sample = sizeof(WavDataType) * 8;
    char data[4] = {'d', 'a', 't', 'a'};
    uint32_t data_size = 0;
  };

  /// Values stored in WavHeader::audio_format.
  enum WavAudioFormat {
    WAV_FORMAT_16BIT_PCM = 1,
    WAV_FORMAT_32BIT_FLOAT = 3
  };

 protected:
  /// Defined only through explicit specializations outside the class.
  inline uint16_t GetWavAudioFormat();

  /// Absolute value of `number`.
  inline float Abs(float number) { return (number < 0) ? -number : number; }

  /// Prints `label`, then every dimension followed by ", ", then a newline.
  template <typename Shape>
  static void PrintShape(const char *label, const Shape &shape) {
    std::cout << label;
    for (auto dim : shape) {
      std::cout << dim << ", ";
    }
    std::cout << std::endl;
  }

 protected:
  float inference_time_ = 0;      // milliseconds, set by RunModel()
  uint32_t wav_sample_rate_ = 0;  // set by Init(), written to the WAV header
  std::vector<WavDataType> wav_;  // samples produced by SaveFloatWav()
  std::shared_ptr<PaddlePredictor> acoustic_model_predictor_ = nullptr;
  std::shared_ptr<PaddlePredictor> vocoder_predictor_ = nullptr;
};
/// WAV audio-format tag used when samples are stored as int16_t.
/// Marked `inline` because an explicit specialization is not implicitly
/// inline: without it, including this file from more than one translation
/// unit produces duplicate definitions at link time.
template <>
inline uint16_t Predictor<int16_t>::GetWavAudioFormat() {
  return Predictor::WAV_FORMAT_16BIT_PCM;
}
/// WAV audio-format tag used when samples are stored as float.
/// Marked `inline` because an explicit specialization is not implicitly
/// inline: without it, including this file from more than one translation
/// unit produces duplicate definitions at link time.
template <>
inline uint16_t Predictor<float>::GetWavAudioFormat() {
  return Predictor::WAV_FORMAT_32BIT_FLOAT;
}
/**
 * Stores `size` float samples as int16_t, scaled so that the largest
 * magnitude in the buffer maps to 32767.
 *
 * The divisor never drops below 0.01, so a buffer whose peak is smaller than
 * that is not stretched all the way to full scale. The fractional part of
 * each scaled sample is truncated.
 *
 * A null buffer or a non-positive size leaves the stored waveform empty
 * (a negative size would otherwise be converted to a huge unsigned length).
 *
 * Marked `inline` because an explicit specialization is not implicitly
 * inline; a non-inline definition in a file included from several translation
 * units causes duplicate symbols.
 */
template <>
inline void Predictor<int16_t>::SaveFloatWav(float *floatWav, int64_t size) {
  if (floatWav == nullptr || size <= 0) {
    wav_.clear();
    return;
  }
  wav_.resize(static_cast<std::vector<int16_t>::size_type>(size));

  // Pass 1: peak magnitude, floored at 0.01.
  float maxSample = 0.01f;
  for (int64_t i = 0; i < size; ++i) {
    maxSample = std::max(maxSample, Abs(floatWav[i]));
  }

  // Pass 2: scale into the int16_t range. |sample| <= maxSample, so the
  // result always lies within [-32767, 32767].
  for (int64_t i = 0; i < size; ++i) {
    wav_[i] = static_cast<int16_t>(floatWav[i] * 32767.0f / maxSample);
  }
}
/**
 * Stores `size` float samples unchanged as the waveform.
 *
 * A null buffer or a non-positive size leaves the stored waveform empty
 * (a negative size would otherwise be converted to a huge unsigned length).
 *
 * Marked `inline` because an explicit specialization is not implicitly
 * inline; a non-inline definition in a file included from several translation
 * units causes duplicate symbols.
 */
template <>
inline void Predictor<float>::SaveFloatWav(float *floatWav, int64_t size) {
  if (floatWav == nullptr || size <= 0) {
    wav_.clear();
    return;
  }
  wav_.assign(floatWav, floatWav + size);
}