From bd4e29b4dd337ab15cdedc7a0f0d34e6d976a1bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Sat, 7 Aug 2021 21:43:52 +0000
Subject: [PATCH] Add `compute_linear_spec=False` to `BaseTTSConfig`

---
 TTS/tts/configs/shared_configs.py | 35 +++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index 4b916a17..8511b1bc 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -13,12 +13,16 @@ class GSTConfig(Coqpit):
     Args:
         gst_style_input_wav (str):
             Path to the wav file used to define the style of the output speech at inference. Defaults to None.
+
         gst_style_input_weights (dict):
             Defines the weights for each style token used at inference. Defaults to None.
+
         gst_embedding_dim (int):
             Defines the size of the GST embedding vector dimensions. Defaults to 256.
+
         gst_num_heads (int):
             Number of attention heads used by the multi-head attention. Defaults to 4.
+
         gst_num_style_tokens (int):
             Number of style token vectors. Defaults to 10.
     """
@@ -51,17 +55,23 @@ class CharactersConfig(Coqpit):
     Args:
         pad (str):
             characters in place of empty padding. Defaults to None.
+
         eos (str):
             characters showing the end of a sentence. Defaults to None.
+
         bos (str):
             characters showing the beginning of a sentence. Defaults to None.
+
         characters (str):
             character set used by the model. Characters not in this list are ignored when converting input text to
             a list of sequence IDs. Defaults to None.
+
         punctuations (str):
             characters considered as punctuation as parsing the input sentence. Defaults to None.
+
         phonemes (str):
             characters considered as parsing phonemes. Defaults to None.
+
         unique (bool):
             remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
             models trained with character lists with duplicates.
@@ -95,54 +105,78 @@ class BaseTTSConfig(BaseTrainingConfig):
     Args:
         audio (BaseAudioConfig):
             Audio processor config object instance.
+
         use_phonemes (bool):
             enable / disable phoneme use.
+
         use_espeak_phonemes (bool):
             enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
+
         compute_input_seq_cache (bool):
             enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
             the training, It allows faster data loader time and precise limitation with `max_seq_len` and
             `min_seq_len`.
+
         text_cleaner (str):
             Name of the text cleaner used for cleaning and formatting transcripts.
+
         enable_eos_bos_chars (bool):
             enable / disable the use of eos and bos characters.
+
         test_senteces_file (str):
             Path to a txt file that has sentences used at test time. The file must have a sentence per line.
+
         phoneme_cache_path (str):
             Path to the output folder caching the computed phonemes for each sample.
+
         characters (CharactersConfig):
             Instance of a CharactersConfig class.
+
         batch_group_size (int):
             Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
             length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
             prevent using the same batches for each epoch.
+
         loss_masking (bool):
             enable / disable masking loss values against padded segments of samples in a batch.
+
         min_seq_len (int):
             Minimum input sequence length to be used at training.
+
         max_seq_len (int):
             Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+
         compute_f0 (int):
             (Not in use yet).
+
+        compute_linear_spec (bool):
+            If True, the data loader computes and returns linear spectrograms alongside the other data.
+
         use_noise_augment (bool):
             Augment the input audio with random noise.
+
         add_blank (bool):
             Add blank characters between each other two characters. It improves performance for some models at expense
             of slower run-time due to the longer input sequence.
+
         datasets (List[BaseDatasetConfig]):
             List of datasets used for training. If multiple datasets are provided, they are merged and used together
             for training.
+
         optimizer (str):
             Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
             Defaults to ``.
+
         optimizer_params (dict):
             Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+
         lr_scheduler (str):
             Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
             `TTS.utils.training`. Defaults to ``.
+
         lr_scheduler_params (dict):
             Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
+
         test_sentences (List[str]):
             List of sentences to be used at testing. Defaults to '[]'
     """
@@ -166,6 +200,7 @@ class BaseTTSConfig(BaseTrainingConfig):
     min_seq_len: int = 1
     max_seq_len: int = float("inf")
     compute_f0: bool = False
+    compute_linear_spec: bool = False
     use_noise_augment: bool = False
     add_blank: bool = False
     # dataset