From bd4e29b4dd337ab15cdedc7a0f0d34e6d976a1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 7 Aug 2021 21:43:52 +0000 Subject: [PATCH] Add `compute_linear_spec=False` to `BaseTTSConfig` --- TTS/tts/configs/shared_configs.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4b916a17..8511b1bc 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -13,12 +13,16 @@ class GSTConfig(Coqpit): Args: gst_style_input_wav (str): Path to the wav file used to define the style of the output speech at inference. Defaults to None. + gst_style_input_weights (dict): Defines the weights for each style token used at inference. Defaults to None. + gst_embedding_dim (int): Defines the size of the GST embedding vector dimensions. Defaults to 256. + gst_num_heads (int): Number of attention heads used by the multi-head attention. Defaults to 4. + gst_num_style_tokens (int): Number of style token vectors. Defaults to 10. """ @@ -51,17 +55,23 @@ class CharactersConfig(Coqpit): Args: pad (str): characters in place of empty padding. Defaults to None. + eos (str): characters showing the end of a sentence. Defaults to None. + bos (str): characters showing the beginning of a sentence. Defaults to None. + characters (str): character set used by the model. Characters not in this list are ignored when converting input text to a list of sequence IDs. Defaults to None. + punctuations (str): characters considered as punctuation when parsing the input sentence. Defaults to None. + phonemes (str): characters considered as phonemes when parsing. Defaults to None. + unique (bool): remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old models trained with character lists with duplicates. 
@@ -95,54 +105,78 @@ class BaseTTSConfig(BaseTrainingConfig): Args: audio (BaseAudioConfig): Audio processor config object instance. + use_phonemes (bool): enable / disable phoneme use. + use_espeak_phonemes (bool): enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). + compute_input_seq_cache (bool): enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of the training, it allows faster data loader time and precise limitation with `max_seq_len` and `min_seq_len`. + text_cleaner (str): Name of the text cleaner used for cleaning and formatting transcripts. + enable_eos_bos_chars (bool): enable / disable the use of eos and bos characters. + test_senteces_file (str): Path to a txt file that has sentences used at test time. The file must have a sentence per line. + phoneme_cache_path (str): Path to the output folder caching the computed phonemes for each sample. + characters (CharactersConfig): Instance of a CharactersConfig class. + batch_group_size (int): Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to prevent using the same batches for each epoch. + loss_masking (bool): enable / disable masking loss values against padded segments of samples in a batch. + min_seq_len (int): Minimum input sequence length to be used at training. + max_seq_len (int): Maximum input sequence length to be used at training. Larger values result in more VRAM usage. + compute_f0 (bool): (Not in use yet). + + compute_linear_spec (bool): + If True, the data loader computes and returns linear spectrograms alongside the other data. + use_noise_augment (bool): Augment the input audio with random noise. + add_blank (bool): Add blank characters between every two characters. 
It improves performance for some models at the expense of slower run-time due to the longer input sequence. + datasets (List[BaseDatasetConfig]): List of datasets used for training. If multiple datasets are provided, they are merged and used together for training. + optimizer (str): Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. Defaults to ``. + optimizer_params (dict): Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or `TTS.utils.training`. Defaults to ``. + lr_scheduler_params (dict): Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. + test_sentences (List[str]): List of sentences to be used at testing. Defaults to '[]' """ @@ -166,6 +200,7 @@ class BaseTTSConfig(BaseTrainingConfig): min_seq_len: int = 1 max_seq_len: int = float("inf") compute_f0: bool = False + compute_linear_spec: bool = False use_noise_augment: bool = False add_blank: bool = False # dataset