# 05360171 created on 2022-03-18 (commit history)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import argparse


def parse_fastpitch_args(parent, add_help=False):
    """
    Build an argument parser holding the FastPitch model hyper-parameters.

    Despite its name, this function does not parse anything: it creates a
    new ``argparse.ArgumentParser`` that inherits every argument of
    ``parent``, registers the model options on it and returns it. The caller
    is responsible for calling ``parse_args`` / ``parse_known_args``.

    Args:
        parent: an ``argparse.ArgumentParser`` whose arguments are inherited
            through the ``parents=`` mechanism.
        add_help: forwarded to ``argparse.ArgumentParser``; when True a
            ``-h/--help`` option is added to the returned parser.

    Returns:
        The new ``argparse.ArgumentParser``. Abbreviated long options are
        rejected because the parser is created with ``allow_abbrev=False``.
    """
    parser = argparse.ArgumentParser(parents=[parent], add_help=add_help,
                                     allow_abbrev=False)
    io = parser.add_argument_group('io parameters')
    io.add_argument('--n-mel-channels', default=80, type=int,
                    help='Number of bins in mel-spectrograms')
    io.add_argument('--max-seq-len', default=2048, type=int,
                    help='Maximum sequence length')

    symbols = parser.add_argument_group('symbols parameters')
    symbols.add_argument('--n-symbols', default=148, type=int,
                         help='Number of symbols in dictionary')
    symbols.add_argument('--padding-idx', default=0, type=int,
                         help='Index of padding symbol in dictionary')
    symbols.add_argument('--symbols-embedding-dim', default=384, type=int,
                         help='Input embedding dimension')

    in_fft = parser.add_argument_group('input FFT parameters')
    in_fft.add_argument('--in-fft-n-layers', default=6, type=int,
                        help='Number of FFT blocks')
    in_fft.add_argument('--in-fft-n-heads', default=1, type=int,
                        help='Number of attention heads')
    in_fft.add_argument('--in-fft-d-head', default=64, type=int,
                        help='Dim of attention heads')
    in_fft.add_argument('--in-fft-conv1d-kernel-size', default=3, type=int,
                        help='Conv-1D kernel size')
    in_fft.add_argument('--in-fft-conv1d-filter-size', default=1536, type=int,
                        help='Conv-1D filter size')
    in_fft.add_argument('--in-fft-output-size', default=384, type=int,
                        help='Output dim')
    in_fft.add_argument('--p-in-fft-dropout', default=0.1, type=float,
                        help='Dropout probability')
    in_fft.add_argument('--p-in-fft-dropatt', default=0.1, type=float,
                        help='Multi-head attention dropout')
    in_fft.add_argument('--p-in-fft-dropemb', default=0.0, type=float,
                        help='Dropout added to word+positional embeddings')

    out_fft = parser.add_argument_group('output FFT parameters')
    out_fft.add_argument('--out-fft-n-layers', default=6, type=int,
                         help='Number of FFT blocks')
    out_fft.add_argument('--out-fft-n-heads', default=1, type=int,
                         help='Number of attention heads')
    out_fft.add_argument('--out-fft-d-head', default=64, type=int,
                         help='Dim of attention head')
    out_fft.add_argument('--out-fft-conv1d-kernel-size', default=3, type=int,
                         help='Conv-1D kernel size')
    out_fft.add_argument('--out-fft-conv1d-filter-size', default=1536, type=int,
                         help='Conv-1D filter size')
    out_fft.add_argument('--out-fft-output-size', default=384, type=int,
                         help='Output dim')
    out_fft.add_argument('--p-out-fft-dropout', default=0.1, type=float,
                         help='Dropout probability for out_fft')
    out_fft.add_argument('--p-out-fft-dropatt', default=0.1, type=float,
                         help='Multi-head attention dropout')
    out_fft.add_argument('--p-out-fft-dropemb', default=0.0, type=float,
                         help='Dropout added to word+positional embeddings')

    dur_pred = parser.add_argument_group('duration predictor parameters')
    dur_pred.add_argument('--dur-predictor-kernel-size', default=3, type=int,
                          help='Duration predictor conv-1D kernel size')
    dur_pred.add_argument('--dur-predictor-filter-size', default=256, type=int,
                          help='Duration predictor conv-1D filter size')
    dur_pred.add_argument('--p-dur-predictor-dropout', default=0.1, type=float,
                          help='Dropout probability for duration predictor')
    dur_pred.add_argument('--dur-predictor-n-layers', default=2, type=int,
                          help='Number of conv-1D layers')

    pitch_pred = parser.add_argument_group('pitch predictor parameters')
    pitch_pred.add_argument('--pitch-predictor-kernel-size', default=3, type=int,
                            help='Pitch predictor conv-1D kernel size')
    pitch_pred.add_argument('--pitch-predictor-filter-size', default=256, type=int,
                            help='Pitch predictor conv-1D filter size')
    pitch_pred.add_argument('--p-pitch-predictor-dropout', default=0.1, type=float,
                            help='Dropout probability for pitch predictor')
    pitch_pred.add_argument('--pitch-predictor-n-layers', default=2, type=int,
                            help='Number of conv-1D layers')

    energy_pred = parser.add_argument_group('energy predictor parameters')
    energy_pred.add_argument('--energy-conditioning', action='store_true',
                             help='Enable energy conditioning')
    energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int,
                             help='Energy predictor conv-1D kernel size')
    energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int,
                             help='Energy predictor conv-1D filter size')
    energy_pred.add_argument('--p-energy-predictor-dropout', default=0.1, type=float,
                             help='Dropout probability for energy predictor')
    energy_pred.add_argument('--energy-predictor-n-layers', default=2, type=int,
                             help='Number of conv-1D layers')

    cond = parser.add_argument_group('conditioning parameters')
    cond.add_argument('--pitch-embedding-kernel-size', default=3, type=int,
                      help='Pitch embedding conv-1D kernel size')
    cond.add_argument('--energy-embedding-kernel-size', default=3, type=int,
                      help='Energy embedding conv-1D kernel size')
    cond.add_argument('--speaker-emb-weight', type=float, default=1.0,
                      help='Scale speaker embedding')

    return parser